Repository: open-mmlab/mmcv Branch: main Commit: a8073c74bf83 Files: 857 Total size: 4.7 MB Directory structure: gitextract_pswz5bz1/ ├── .dev_scripts/ │ └── check_installation.py ├── .dockerignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── 1-bug-report.yml │ │ ├── 2-feature_request.yml │ │ ├── 3-documentation.yml │ │ └── config.yml │ ├── pull_request_template.md │ └── workflows/ │ ├── build_macos_wheel.yml │ ├── lint.yml │ ├── merge_stage_test.yml │ ├── pr_stage_test.yml │ └── publish-to-pypi.yml ├── .gitignore ├── .pre-commit-config-zh-cn.yaml ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CITATION.cff ├── CONTRIBUTING.md ├── CONTRIBUTING_zh-CN.md ├── LICENSE ├── LICENSES.md ├── MANIFEST.in ├── README.md ├── README_zh-CN.md ├── TERMINOLOGY.md ├── docker/ │ ├── README.md │ ├── dev/ │ │ └── Dockerfile │ └── release/ │ └── Dockerfile ├── docs/ │ ├── en/ │ │ ├── Makefile │ │ ├── _static/ │ │ │ ├── css/ │ │ │ │ └── readthedocs.css │ │ │ └── version.json │ │ ├── _templates/ │ │ │ └── classtemplate.rst │ │ ├── api/ │ │ │ ├── arraymisc.rst │ │ │ ├── cnn.rst │ │ │ ├── image.rst │ │ │ ├── ops.rst │ │ │ ├── transforms.rst │ │ │ ├── utils.rst │ │ │ ├── video.rst │ │ │ └── visualization.rst │ │ ├── community/ │ │ │ ├── contributing.md │ │ │ └── pr.md │ │ ├── compatibility.md │ │ ├── conf.py │ │ ├── deployment/ │ │ │ └── mmcv_ops_definition.md │ │ ├── docutils.conf │ │ ├── faq.md │ │ ├── get_started/ │ │ │ ├── api_reference.md │ │ │ ├── build.md │ │ │ ├── installation.md │ │ │ ├── introduction.md │ │ │ └── previous_versions.md │ │ ├── index.rst │ │ ├── make.bat │ │ ├── switch_language.md │ │ └── understand_mmcv/ │ │ ├── cnn.md │ │ ├── data_process.md │ │ ├── data_transform.md │ │ ├── ops.md │ │ └── visualization.md │ └── zh_cn/ │ ├── Makefile │ ├── _static/ │ │ ├── css/ │ │ │ └── readthedocs.css │ │ └── version.json │ ├── _templates/ │ │ └── classtemplate.rst │ ├── api/ │ │ ├── arraymisc.rst │ │ ├── cnn.rst │ │ ├── image.rst │ │ ├── ops.rst │ │ ├── transforms.rst │ │ ├── 
utils.rst │ │ ├── video.rst │ │ └── visualization.rst │ ├── community/ │ │ ├── code_style.md │ │ ├── contributing.md │ │ └── pr.md │ ├── compatibility.md │ ├── conf.py │ ├── docutils.conf │ ├── faq.md │ ├── get_started/ │ │ ├── api_reference.md │ │ ├── article.md │ │ ├── build.md │ │ ├── installation.md │ │ ├── introduction.md │ │ └── previous_versions.md │ ├── index.rst │ ├── make.bat │ ├── switch_language.md │ └── understand_mmcv/ │ ├── cnn.md │ ├── data_process.md │ ├── data_transform.md │ ├── ops.md │ └── visualization.md ├── mmcv/ │ ├── __init__.py │ ├── arraymisc/ │ │ ├── __init__.py │ │ └── quantization.py │ ├── cnn/ │ │ ├── __init__.py │ │ ├── alexnet.py │ │ ├── bricks/ │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── context_block.py │ │ │ ├── conv.py │ │ │ ├── conv2d_adaptive_padding.py │ │ │ ├── conv_module.py │ │ │ ├── conv_ws.py │ │ │ ├── depthwise_separable_conv_module.py │ │ │ ├── drop.py │ │ │ ├── generalized_attention.py │ │ │ ├── hsigmoid.py │ │ │ ├── hswish.py │ │ │ ├── non_local.py │ │ │ ├── norm.py │ │ │ ├── padding.py │ │ │ ├── plugin.py │ │ │ ├── scale.py │ │ │ ├── swish.py │ │ │ ├── transformer.py │ │ │ ├── upsample.py │ │ │ └── wrappers.py │ │ ├── resnet.py │ │ ├── rfsearch/ │ │ │ ├── __init__.py │ │ │ ├── operator.py │ │ │ ├── search.py │ │ │ └── utils.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ ├── flops_counter.py │ │ │ └── fuse_conv_bn.py │ │ └── vgg.py │ ├── image/ │ │ ├── __init__.py │ │ ├── colorspace.py │ │ ├── geometric.py │ │ ├── io.py │ │ ├── misc.py │ │ └── photometric.py │ ├── ops/ │ │ ├── __init__.py │ │ ├── active_rotated_filter.py │ │ ├── assign_score_withk.py │ │ ├── ball_query.py │ │ ├── bbox.py │ │ ├── bezier_align.py │ │ ├── bias_act.py │ │ ├── border_align.py │ │ ├── box_iou_quadri.py │ │ ├── box_iou_rotated.py │ │ ├── carafe.py │ │ ├── cc_attention.py │ │ ├── chamfer_distance.py │ │ ├── contour_expand.py │ │ ├── conv2d_gradfix.py │ │ ├── convex_iou.py │ │ ├── corner_pool.py │ │ ├── correlation.py │ │ ├── csrc/ 
│ │ │ ├── README.md │ │ │ ├── common/ │ │ │ │ ├── box_iou_rotated_utils.hpp │ │ │ │ ├── cuda/ │ │ │ │ │ ├── active_rotated_filter_cuda_kernel.cuh │ │ │ │ │ ├── assign_score_withk_cuda_kernel.cuh │ │ │ │ │ ├── ball_query_cuda_kernel.cuh │ │ │ │ │ ├── bbox_overlaps_cuda_kernel.cuh │ │ │ │ │ ├── bezier_align_cuda_kernel.cuh │ │ │ │ │ ├── border_align_cuda_kernel.cuh │ │ │ │ │ ├── box_iou_quadri_cuda.cuh │ │ │ │ │ ├── box_iou_rotated_cuda.cuh │ │ │ │ │ ├── carafe_cuda_kernel.cuh │ │ │ │ │ ├── carafe_naive_cuda_kernel.cuh │ │ │ │ │ ├── chamfer_distance_cuda_kernel.cuh │ │ │ │ │ ├── common_cuda_helper.hpp │ │ │ │ │ ├── convex_iou_cuda_kernel.cuh │ │ │ │ │ ├── correlation_cuda.cuh │ │ │ │ │ ├── deform_conv_cuda_kernel.cuh │ │ │ │ │ ├── deform_roi_pool_cuda_kernel.cuh │ │ │ │ │ ├── diff_iou_rotated_cuda_kernel.cuh │ │ │ │ │ ├── furthest_point_sample_cuda_kernel.cuh │ │ │ │ │ ├── gather_points_cuda_kernel.cuh │ │ │ │ │ ├── group_points_cuda_kernel.cuh │ │ │ │ │ ├── iou3d_cuda_kernel.cuh │ │ │ │ │ ├── knn_cuda_kernel.cuh │ │ │ │ │ ├── masked_conv2d_cuda_kernel.cuh │ │ │ │ │ ├── min_area_polygons_cuda.cuh │ │ │ │ │ ├── modulated_deform_conv_cuda_kernel.cuh │ │ │ │ │ ├── ms_deform_attn_cuda_kernel.cuh │ │ │ │ │ ├── nms_cuda_kernel.cuh │ │ │ │ │ ├── nms_quadri_cuda.cuh │ │ │ │ │ ├── nms_rotated_cuda.cuh │ │ │ │ │ ├── parrots_cudawarpfunction.cuh │ │ │ │ │ ├── points_in_boxes_cuda_kernel.cuh │ │ │ │ │ ├── points_in_polygons_cuda_kernel.cuh │ │ │ │ │ ├── prroi_pool_cuda_kernel.cuh │ │ │ │ │ ├── psamask_cuda_kernel.cuh │ │ │ │ │ ├── riroi_align_rotated_cuda_kernel.cuh │ │ │ │ │ ├── roi_align_cuda_kernel.cuh │ │ │ │ │ ├── roi_align_rotated_cuda_kernel.cuh │ │ │ │ │ ├── roi_pool_cuda_kernel.cuh │ │ │ │ │ ├── roiaware_pool3d_cuda_kernel.cuh │ │ │ │ │ ├── roipoint_pool3d_cuda_kernel.cuh │ │ │ │ │ ├── rotated_feature_align_cuda_kernel.cuh │ │ │ │ │ ├── scatter_points_cuda_kernel.cuh │ │ │ │ │ ├── sigmoid_focal_loss_cuda_kernel.cuh │ │ │ │ │ ├── softmax_focal_loss_cuda_kernel.cuh │ │ │ 
│ │ ├── spconv/ │ │ │ │ │ │ ├── indice.cuh │ │ │ │ │ │ └── reordering.cuh │ │ │ │ │ ├── stack_ball_query_cuda_kernel.cuh │ │ │ │ │ ├── stack_group_points_cuda_kernel.cuh │ │ │ │ │ ├── sync_bn_cuda_kernel.cuh │ │ │ │ │ ├── three_interpolate_cuda_kernel.cuh │ │ │ │ │ ├── three_nn_cuda_kernel.cuh │ │ │ │ │ ├── tin_shift_cuda_kernel.cuh │ │ │ │ │ └── voxelization_cuda_kernel.cuh │ │ │ │ ├── mlu/ │ │ │ │ │ ├── common_mlu_helper.hpp │ │ │ │ │ ├── masked_conv2d_mlu_kernel.mlu │ │ │ │ │ └── roi_pool_mlu_kernel.mlu │ │ │ │ ├── mps/ │ │ │ │ │ ├── MPSDevice.h │ │ │ │ │ ├── MPSLibrary.h │ │ │ │ │ ├── MPSLibrary.mm │ │ │ │ │ ├── MPSStream.h │ │ │ │ │ └── MPSUtils.h │ │ │ │ ├── musa/ │ │ │ │ │ ├── active_rotated_filter_musa_kernel.muh │ │ │ │ │ ├── assign_score_withk_musa_kernel.muh │ │ │ │ │ ├── ball_query_musa_kernel.muh │ │ │ │ │ ├── bbox_overlaps_musa_kernel.muh │ │ │ │ │ ├── bezier_align_musa_kernel.muh │ │ │ │ │ ├── border_align_musa_kernel.muh │ │ │ │ │ ├── box_iou_quadri_musa.muh │ │ │ │ │ ├── box_iou_rotated_musa.muh │ │ │ │ │ ├── carafe_musa_kernel.muh │ │ │ │ │ ├── carafe_naive_musa_kernel.muh │ │ │ │ │ ├── chamfer_distance_musa_kernel.muh │ │ │ │ │ ├── common_musa_helper.hpp │ │ │ │ │ ├── convex_iou_musa_kernel.muh │ │ │ │ │ ├── correlation_musa.muh │ │ │ │ │ ├── deform_conv_musa_kernel.muh │ │ │ │ │ ├── deform_roi_pool_musa_kernel.muh │ │ │ │ │ ├── diff_iou_rotated_musa_kernel.muh │ │ │ │ │ ├── furthest_point_sample_musa_kernel.muh │ │ │ │ │ ├── gather_points_musa_kernel.muh │ │ │ │ │ ├── group_points_musa_kernel.muh │ │ │ │ │ ├── iou3d_musa_kernel.muh │ │ │ │ │ ├── knn_musa_kernel.muh │ │ │ │ │ ├── masked_conv2d_musa_kernel.muh │ │ │ │ │ ├── min_area_polygons_musa.muh │ │ │ │ │ ├── modulated_deform_conv_musa_kernel.muh │ │ │ │ │ ├── ms_deform_attn_musa_kernel.muh │ │ │ │ │ ├── nms_musa_kernel.muh │ │ │ │ │ ├── nms_quadri_musa.muh │ │ │ │ │ ├── nms_rotated_musa.muh │ │ │ │ │ ├── points_in_boxes_musa_kernel.muh │ │ │ │ │ ├── points_in_polygons_musa_kernel.muh │ │ │ │ 
│ ├── prroi_pool_musa_kernel.muh │ │ │ │ │ ├── psamask_musa_kernel.muh │ │ │ │ │ ├── riroi_align_rotated_musa_kernel.muh │ │ │ │ │ ├── roi_align_musa_kernel.muh │ │ │ │ │ ├── roi_align_rotated_musa_kernel.muh │ │ │ │ │ ├── roi_pool_musa_kernel.muh │ │ │ │ │ ├── roiaware_pool3d_musa_kernel.muh │ │ │ │ │ ├── roipoint_pool3d_musa_kernel.muh │ │ │ │ │ ├── rotated_feature_align_musa_kernel.muh │ │ │ │ │ ├── scatter_points_musa_kernel.muh │ │ │ │ │ ├── sigmoid_focal_loss_musa_kernel.muh │ │ │ │ │ ├── softmax_focal_loss_musa_kernel.muh │ │ │ │ │ ├── spconv/ │ │ │ │ │ │ ├── indice.muh │ │ │ │ │ │ └── reordering.muh │ │ │ │ │ ├── stack_ball_query_musa_kernel.muh │ │ │ │ │ ├── stack_group_points_musa_kernel.muh │ │ │ │ │ ├── sync_bn_musa_kernel.muh │ │ │ │ │ ├── three_interpolate_musa_kernel.muh │ │ │ │ │ ├── three_nn_musa_kernel.muh │ │ │ │ │ ├── tin_shift_musa_kernel.muh │ │ │ │ │ └── voxelization_musa_kernel.muh │ │ │ │ ├── parrots_cpp_helper.hpp │ │ │ │ ├── parrots_cuda_helper.hpp │ │ │ │ ├── pytorch_cpp_helper.hpp │ │ │ │ ├── pytorch_cuda_helper.hpp │ │ │ │ ├── pytorch_device_registry.hpp │ │ │ │ ├── pytorch_mlu_helper.hpp │ │ │ │ ├── pytorch_musa_helper.hpp │ │ │ │ ├── pytorch_npu_helper.hpp │ │ │ │ ├── pytorch_npu_util.hpp │ │ │ │ └── utils/ │ │ │ │ └── spconv/ │ │ │ │ ├── paramsgrid.h │ │ │ │ ├── prettyprint.h │ │ │ │ ├── pybind11_utils.h │ │ │ │ ├── spconv/ │ │ │ │ │ ├── geometry.h │ │ │ │ │ ├── indice.h │ │ │ │ │ ├── maxpool.h │ │ │ │ │ ├── mp_helper.h │ │ │ │ │ ├── point2voxel.h │ │ │ │ │ └── reordering.h │ │ │ │ └── tensorview/ │ │ │ │ ├── helper_kernel.cuh │ │ │ │ ├── helper_kernel.muh │ │ │ │ ├── helper_launch.h │ │ │ │ └── tensorview.h │ │ │ ├── parrots/ │ │ │ │ ├── active_rotated_filter.cpp │ │ │ │ ├── active_rotated_filter_parrots.cpp │ │ │ │ ├── active_rotated_filter_pytorch.h │ │ │ │ ├── assign_score_withk.cpp │ │ │ │ ├── assign_score_withk_parrots.cpp │ │ │ │ ├── assign_score_withk_pytorch.h │ │ │ │ ├── ball_query._parrots.cpp │ │ │ │ ├── ball_query.cpp │ 
│ │ │ ├── ball_query_pytorch.h │ │ │ │ ├── bbox_overlaps.cpp │ │ │ │ ├── bbox_overlaps_parrots.cpp │ │ │ │ ├── bbox_overlaps_pytorch.h │ │ │ │ ├── border_align.cpp │ │ │ │ ├── border_align_parrots.cpp │ │ │ │ ├── border_align_pytorch.h │ │ │ │ ├── box_iou_rotated.cpp │ │ │ │ ├── box_iou_rotated_parrots.cpp │ │ │ │ ├── box_iou_rotated_pytorch.h │ │ │ │ ├── carafe.cpp │ │ │ │ ├── carafe_naive.cpp │ │ │ │ ├── carafe_naive_parrots.cpp │ │ │ │ ├── carafe_naive_pytorch.h │ │ │ │ ├── carafe_parrots.cpp │ │ │ │ ├── carafe_pytorch.h │ │ │ │ ├── chamfer_distance.cpp │ │ │ │ ├── chamfer_distance_parrots.cpp │ │ │ │ ├── chamfer_distance_pytorch.h │ │ │ │ ├── contour_expand.cpp │ │ │ │ ├── contour_expand_parrots.cpp │ │ │ │ ├── contour_expand_pytorch.h │ │ │ │ ├── convex_iou.cpp │ │ │ │ ├── convex_iou_parrots.cpp │ │ │ │ ├── convex_iou_pytorch.h │ │ │ │ ├── correlation.cpp │ │ │ │ ├── correlation_parrots.cpp │ │ │ │ ├── correlation_pytorch.h │ │ │ │ ├── cudabind.cpp │ │ │ │ ├── deform_conv.cpp │ │ │ │ ├── deform_conv_parrots.cpp │ │ │ │ ├── deform_conv_pytorch.h │ │ │ │ ├── deform_roi_pool.cpp │ │ │ │ ├── deform_roi_pool_parrots.cpp │ │ │ │ ├── deform_roi_pool_pytorch.h │ │ │ │ ├── diff_iou_rotated.cpp │ │ │ │ ├── diff_iou_rotated_parrots.cpp │ │ │ │ ├── diff_iou_rotated_pytorch.h │ │ │ │ ├── focal_loss.cpp │ │ │ │ ├── focal_loss_parrots.cpp │ │ │ │ ├── focal_loss_pytorch.h │ │ │ │ ├── furthest_point_sample.cpp │ │ │ │ ├── furthest_point_sample_parrots.cpp │ │ │ │ ├── furthest_point_sample_pytorch.h │ │ │ │ ├── fused_bias_leakyrelu.cpp │ │ │ │ ├── fused_bias_parrots.cpp │ │ │ │ ├── gather_points.cpp │ │ │ │ ├── gather_points_parrots.cpp │ │ │ │ ├── gather_points_pytorch.h │ │ │ │ ├── group_points.cpp │ │ │ │ ├── group_points_parrots.cpp │ │ │ │ ├── group_points_pytorch.h │ │ │ │ ├── info.cpp │ │ │ │ ├── iou3d.cpp │ │ │ │ ├── iou3d_parrots.cpp │ │ │ │ ├── iou3d_pytorch.h │ │ │ │ ├── knn.cpp │ │ │ │ ├── knn_parrots.cpp │ │ │ │ ├── knn_pytorch.h │ │ │ │ ├── masked_conv2d.cpp │ │ │ 
│ ├── masked_conv2d_parrots.cpp │ │ │ │ ├── masked_conv2d_pytorch.h │ │ │ │ ├── min_area_polygons.cpp │ │ │ │ ├── min_area_polygons_parrots.cpp │ │ │ │ ├── min_area_polygons_pytorch.h │ │ │ │ ├── modulated_deform_conv.cpp │ │ │ │ ├── modulated_deform_conv_parrots.cpp │ │ │ │ ├── modulated_deform_conv_pytorch.h │ │ │ │ ├── ms_deform_attn.cpp │ │ │ │ ├── ms_deform_attn_parrots.cpp │ │ │ │ ├── nms.cpp │ │ │ │ ├── nms_parrots.cpp │ │ │ │ ├── nms_pytorch.h │ │ │ │ ├── nms_rotated.cpp │ │ │ │ ├── pixel_group.cpp │ │ │ │ ├── pixel_group_parrots.cpp │ │ │ │ ├── pixel_group_pytorch.h │ │ │ │ ├── points_in_boxes.cpp │ │ │ │ ├── points_in_boxes_parrots.cpp │ │ │ │ ├── points_in_boxes_pytorch.h │ │ │ │ ├── points_in_polygons.cpp │ │ │ │ ├── points_in_polygons_parrots.cpp │ │ │ │ ├── points_in_polygons_pytorch.h │ │ │ │ ├── prroi_pool.cpp │ │ │ │ ├── prroi_pool_parrots.cpp │ │ │ │ ├── prroi_pool_pytorch.h │ │ │ │ ├── psamask.cpp │ │ │ │ ├── psamask_parrots.cpp │ │ │ │ ├── psamask_pytorch.h │ │ │ │ ├── riroi_align_rotated.cpp │ │ │ │ ├── riroi_align_rotated_parrots.cpp │ │ │ │ ├── riroi_align_rotated_pytorch.h │ │ │ │ ├── roi_align.cpp │ │ │ │ ├── roi_align_parrots.cpp │ │ │ │ ├── roi_align_pytorch.h │ │ │ │ ├── roi_align_rotated.cpp │ │ │ │ ├── roi_align_rotated_parrots.cpp │ │ │ │ ├── roi_align_rotated_pytorch.h │ │ │ │ ├── roi_pool.cpp │ │ │ │ ├── roi_pool_parrots.cpp │ │ │ │ ├── roi_pool_pytorch.h │ │ │ │ ├── roiaware_pool3d.cpp │ │ │ │ ├── roiaware_pool3d_parrots.cpp │ │ │ │ ├── roiaware_pool3d_pytorch.h │ │ │ │ ├── roipoint_pool3d.cpp │ │ │ │ ├── roipoint_pool3d_parrots.cpp │ │ │ │ ├── roipoint_pool3d_pytorch.h │ │ │ │ ├── rotated_feature_align.cpp │ │ │ │ ├── rotated_feature_align_parrots.cpp │ │ │ │ ├── rotated_feature_align_pytorch.h │ │ │ │ ├── sync_bn.cpp │ │ │ │ ├── sync_bn_parrots.cpp │ │ │ │ ├── sync_bn_pytorch.h │ │ │ │ ├── three_interpolate.cpp │ │ │ │ ├── three_interpolate_parrots.cpp │ │ │ │ ├── three_interpolate_pytorch.h │ │ │ │ ├── three_nn.cpp │ │ │ │ ├── 
three_nn_parrots.cpp │ │ │ │ ├── three_nn_pytorch.h │ │ │ │ ├── tin_shift.cpp │ │ │ │ ├── tin_shift_parrots.cpp │ │ │ │ ├── tin_shift_pytorch.h │ │ │ │ ├── upfirdn2d.cpp │ │ │ │ ├── upfirdn2d_parrots.cpp │ │ │ │ ├── voxelization.cpp │ │ │ │ ├── voxelization_parrots.cpp │ │ │ │ └── voxelization_pytorch.h │ │ │ └── pytorch/ │ │ │ ├── active_rotated_filter.cpp │ │ │ ├── assign_score_withk.cpp │ │ │ ├── ball_query.cpp │ │ │ ├── bbox_overlaps.cpp │ │ │ ├── bezier_align.cpp │ │ │ ├── bias_act.cpp │ │ │ ├── border_align.cpp │ │ │ ├── box_iou_quadri.cpp │ │ │ ├── box_iou_rotated.cpp │ │ │ ├── carafe.cpp │ │ │ ├── carafe_naive.cpp │ │ │ ├── chamfer_distance.cpp │ │ │ ├── contour_expand.cpp │ │ │ ├── convex_iou.cpp │ │ │ ├── correlation.cpp │ │ │ ├── cpu/ │ │ │ │ ├── active_rotated_filter.cpp │ │ │ │ ├── bbox_overlaps_cpu.cpp │ │ │ │ ├── bezier_align.cpp │ │ │ │ ├── box_iou_quadri.cpp │ │ │ │ ├── box_iou_rotated.cpp │ │ │ │ ├── deform_conv.cpp │ │ │ │ ├── modulated_deform_conv.cpp │ │ │ │ ├── nms.cpp │ │ │ │ ├── nms_quadri.cpp │ │ │ │ ├── nms_rotated.cpp │ │ │ │ ├── pixel_group.cpp │ │ │ │ ├── points_in_boxes.cpp │ │ │ │ ├── psamask.cpp │ │ │ │ ├── roi_align.cpp │ │ │ │ ├── roi_align_rotated.cpp │ │ │ │ ├── rotated_feature_align.cpp │ │ │ │ ├── sparse_indice.cpp │ │ │ │ ├── sparse_maxpool.cpp │ │ │ │ ├── sparse_reordering.cpp │ │ │ │ └── voxelization.cpp │ │ │ ├── cuda/ │ │ │ │ ├── active_rotated_filter_cuda.cu │ │ │ │ ├── assign_score_withk_cuda.cu │ │ │ │ ├── ball_query_cuda.cu │ │ │ │ ├── bbox_overlaps_cuda.cu │ │ │ │ ├── bezier_align_cuda.cu │ │ │ │ ├── bias_act_cuda.cu │ │ │ │ ├── border_align_cuda.cu │ │ │ │ ├── box_iou_quadri_cuda.cu │ │ │ │ ├── box_iou_rotated_cuda.cu │ │ │ │ ├── carafe_cuda.cu │ │ │ │ ├── carafe_naive_cuda.cu │ │ │ │ ├── chamfer_distance_cuda.cu │ │ │ │ ├── convex_iou.cu │ │ │ │ ├── correlation_cuda.cu │ │ │ │ ├── cudabind.cpp │ │ │ │ ├── deform_conv_cuda.cu │ │ │ │ ├── deform_roi_pool_cuda.cu │ │ │ │ ├── diff_iou_rotated_cuda.cu │ │ │ │ ├── 
filtered_lrelu.cu │ │ │ │ ├── focal_loss_cuda.cu │ │ │ │ ├── furthest_point_sample_cuda.cu │ │ │ │ ├── fused_bias_leakyrelu_cuda.cu │ │ │ │ ├── fused_spconv_ops_cuda.cu │ │ │ │ ├── gather_points_cuda.cu │ │ │ │ ├── group_points_cuda.cu │ │ │ │ ├── iou3d_cuda.cu │ │ │ │ ├── knn_cuda.cu │ │ │ │ ├── masked_conv2d_cuda.cu │ │ │ │ ├── min_area_polygons.cu │ │ │ │ ├── modulated_deform_conv_cuda.cu │ │ │ │ ├── ms_deform_attn_cuda.cu │ │ │ │ ├── nms_cuda.cu │ │ │ │ ├── nms_quadri_cuda.cu │ │ │ │ ├── nms_rotated_cuda.cu │ │ │ │ ├── points_in_boxes_cuda.cu │ │ │ │ ├── points_in_polygons_cuda.cu │ │ │ │ ├── prroi_pool_cuda.cu │ │ │ │ ├── psamask_cuda.cu │ │ │ │ ├── riroi_align_rotated_cuda.cu │ │ │ │ ├── roi_align_cuda.cu │ │ │ │ ├── roi_align_rotated_cuda.cu │ │ │ │ ├── roi_pool_cuda.cu │ │ │ │ ├── roiaware_pool3d_cuda.cu │ │ │ │ ├── roipoint_pool3d_cuda.cu │ │ │ │ ├── rotated_feature_align_cuda.cu │ │ │ │ ├── scatter_points_cuda.cu │ │ │ │ ├── sparse_indice.cu │ │ │ │ ├── sparse_maxpool.cu │ │ │ │ ├── sparse_pool_ops_cuda.cu │ │ │ │ ├── sparse_reordering.cu │ │ │ │ ├── spconv_ops_cuda.cu │ │ │ │ ├── stack_ball_query_cuda.cu │ │ │ │ ├── stack_group_points_cuda.cu │ │ │ │ ├── sync_bn_cuda.cu │ │ │ │ ├── three_interpolate_cuda.cu │ │ │ │ ├── three_nn_cuda.cu │ │ │ │ ├── tin_shift_cuda.cu │ │ │ │ ├── upfirdn2d_kernel.cu │ │ │ │ └── voxelization_cuda.cu │ │ │ ├── deform_conv.cpp │ │ │ ├── deform_roi_pool.cpp │ │ │ ├── diff_iou_rotated.cpp │ │ │ ├── filtered_lrelu.cpp │ │ │ ├── focal_loss.cpp │ │ │ ├── furthest_point_sample.cpp │ │ │ ├── fused_bias_leakyrelu.cpp │ │ │ ├── fused_spconv_ops.cpp │ │ │ ├── gather_points.cpp │ │ │ ├── group_points.cpp │ │ │ ├── info.cpp │ │ │ ├── iou3d.cpp │ │ │ ├── knn.cpp │ │ │ ├── masked_conv2d.cpp │ │ │ ├── min_area_polygons.cpp │ │ │ ├── mlu/ │ │ │ │ ├── ball_query_mlu.cpp │ │ │ │ ├── bbox_overlaps_mlu.cpp │ │ │ │ ├── box_iou_rotated.cpp │ │ │ │ ├── carafe_mlu.cpp │ │ │ │ ├── deform_roi_pool_mlu.cpp │ │ │ │ ├── diff_iou_rotated_mlu.cpp │ │ │ │ 
├── focal_loss_sigmoid_mlu.cpp │ │ │ │ ├── iou3d_mlu.cpp │ │ │ │ ├── masked_conv2d_mlu.cpp │ │ │ │ ├── mlu_common_helper.cpp │ │ │ │ ├── mlu_common_helper.h │ │ │ │ ├── ms_deform_attn_mlu.cpp │ │ │ │ ├── nms_mlu.cpp │ │ │ │ ├── nms_rotated_mlu.cpp │ │ │ │ ├── psamask_mlu.cpp │ │ │ │ ├── roi_align_mlu.cpp │ │ │ │ ├── roi_align_rotated_mlu.cpp │ │ │ │ ├── roi_pool_mlu.cpp │ │ │ │ ├── roiaware_pool3d_mlu.cpp │ │ │ │ ├── roipoint_pool3d_mlu.cpp │ │ │ │ ├── rotated_feature_align_mlu.cpp │ │ │ │ ├── scatter_points_mlu.cpp │ │ │ │ ├── sparse_conv_mlu.cpp │ │ │ │ ├── three_nn_mlu.cpp │ │ │ │ ├── tin_shift_mlu.cpp │ │ │ │ └── voxelization_mlu.cpp │ │ │ ├── modulated_deform_conv.cpp │ │ │ ├── mps/ │ │ │ │ └── bbox_overlaps_mps.mm │ │ │ ├── ms_deform_attn.cpp │ │ │ ├── musa/ │ │ │ │ ├── active_rotated_filter_musa.mu │ │ │ │ ├── assign_score_withk_musa.mu │ │ │ │ ├── ball_query_musa.mu │ │ │ │ ├── bbox_overlaps_musa.mu │ │ │ │ ├── bezier_align_musa.mu │ │ │ │ ├── bias_act_musa.mu │ │ │ │ ├── border_align_musa.mu │ │ │ │ ├── box_iou_quadri_musa.mu │ │ │ │ ├── box_iou_rotated_musa.mu │ │ │ │ ├── carafe_musa.mu │ │ │ │ ├── carafe_naive_musa.mu │ │ │ │ ├── chamfer_distance_musa.mu │ │ │ │ ├── convex_iou.mu │ │ │ │ ├── correlation_musa.mu │ │ │ │ ├── deform_conv_musa.mu │ │ │ │ ├── deform_roi_pool_musa.mu │ │ │ │ ├── diff_iou_rotated_musa.mu │ │ │ │ ├── filtered_lrelu.mu │ │ │ │ ├── focal_loss_musa.mu │ │ │ │ ├── furthest_point_sample_musa.mu │ │ │ │ ├── fused_bias_leakyrelu_musa.mu │ │ │ │ ├── fused_spconv_ops_musa.mu │ │ │ │ ├── gather_points_musa.mu │ │ │ │ ├── group_points_musa.mu │ │ │ │ ├── iou3d_musa.mu │ │ │ │ ├── knn_musa.mu │ │ │ │ ├── masked_conv2d_musa.mu │ │ │ │ ├── min_area_polygons.mu │ │ │ │ ├── modulated_deform_conv_musa.mu │ │ │ │ ├── ms_deform_attn_musa.mu │ │ │ │ ├── musabind.cpp │ │ │ │ ├── nms_musa.mu │ │ │ │ ├── nms_quadri_musa.mu │ │ │ │ ├── nms_rotated_musa.mu │ │ │ │ ├── points_in_boxes_musa.mu │ │ │ │ ├── points_in_polygons_musa.mu │ │ │ │ ├── 
prroi_pool_musa.mu │ │ │ │ ├── psamask_musa.mu │ │ │ │ ├── riroi_align_rotated_musa.mu │ │ │ │ ├── roi_align_musa.mu │ │ │ │ ├── roi_align_rotated_musa.mu │ │ │ │ ├── roi_pool_musa.mu │ │ │ │ ├── roiaware_pool3d_musa.mu │ │ │ │ ├── roipoint_pool3d_musa.mu │ │ │ │ ├── rotated_feature_align_musa.mu │ │ │ │ ├── scatter_points_musa.mu │ │ │ │ ├── sparse_indice.mu │ │ │ │ ├── sparse_maxpool.mu │ │ │ │ ├── sparse_pool_ops_musa.mu │ │ │ │ ├── sparse_reordering.mu │ │ │ │ ├── spconv_ops_musa.mu │ │ │ │ ├── stack_ball_query_musa.mu │ │ │ │ ├── stack_group_points_musa.mu │ │ │ │ ├── sync_bn_musa.mu │ │ │ │ ├── three_interpolate_musa.mu │ │ │ │ ├── three_nn_musa.mu │ │ │ │ ├── tin_shift_musa.mu │ │ │ │ ├── upfirdn2d_kernel.mu │ │ │ │ └── voxelization_musa.mu │ │ │ ├── nms.cpp │ │ │ ├── nms_quadri.cpp │ │ │ ├── nms_rotated.cpp │ │ │ ├── npu/ │ │ │ │ ├── active_rotated_filter_npu.cpp │ │ │ │ ├── assign_score_withk_npu.cpp │ │ │ │ ├── ball_query_npu.cpp │ │ │ │ ├── bbox_overlaps_npu.cpp │ │ │ │ ├── border_align_npu.cpp │ │ │ │ ├── box_iou_quadri_npu.cpp │ │ │ │ ├── box_iou_rotated_npu.cpp │ │ │ │ ├── boxes_overlap_bev_npu.cpp │ │ │ │ ├── chamfer_distance_npu.cpp │ │ │ │ ├── common_util.h │ │ │ │ ├── deform_roi_pool.cpp │ │ │ │ ├── diff_iou_rotated_npu.cpp │ │ │ │ ├── focal_loss_npu.cpp │ │ │ │ ├── furthest_point_sample_npu.cpp │ │ │ │ ├── furthest_point_sampling_with_dist_npu.cpp │ │ │ │ ├── fused_bias_leakyrelu_npu.cpp │ │ │ │ ├── gather_points_npu.cpp │ │ │ │ ├── group_points_npu.cpp │ │ │ │ ├── knn_npu.cpp │ │ │ │ ├── ms_deform_attn_npu.cpp │ │ │ │ ├── nms3d_normal_npu.cpp │ │ │ │ ├── nms3d_npu.cpp │ │ │ │ ├── nms_npu.cpp │ │ │ │ ├── nms_rotated_npu.cpp │ │ │ │ ├── points_in_box_npu.cpp │ │ │ │ ├── points_in_box_npu_all.cpp │ │ │ │ ├── points_in_polygons_npu.cpp │ │ │ │ ├── psa_mask_npu.cpp │ │ │ │ ├── roi_align_npu.cpp │ │ │ │ ├── roi_align_rotated_npu.cpp │ │ │ │ ├── roi_pool_npu.cpp │ │ │ │ ├── roiaware_pool3d_npu.cpp │ │ │ │ ├── roipoint_pool3d_forward.cpp │ │ │ │ ├── 
rotated_feature_align_npu.cpp │ │ │ │ ├── stack_ball_query_npu.cpp │ │ │ │ ├── stack_group_points_npu.cpp │ │ │ │ ├── three_interpolate_npu.cpp │ │ │ │ ├── three_nn_npu.cpp │ │ │ │ └── voxelization_npu.cpp │ │ │ ├── pixel_group.cpp │ │ │ ├── points_in_boxes.cpp │ │ │ ├── points_in_polygons.cpp │ │ │ ├── prroi_pool.cpp │ │ │ ├── psamask.cpp │ │ │ ├── pybind.cpp │ │ │ ├── riroi_align_rotated.cpp │ │ │ ├── roi_align.cpp │ │ │ ├── roi_align_rotated.cpp │ │ │ ├── roi_pool.cpp │ │ │ ├── roiaware_pool3d.cpp │ │ │ ├── roipoint_pool3d.cpp │ │ │ ├── rotated_feature_align.cpp │ │ │ ├── scatter_points.cpp │ │ │ ├── sparse_pool_ops.cpp │ │ │ ├── spconv_ops.cpp │ │ │ ├── spconv_utils.h │ │ │ ├── sync_bn.cpp │ │ │ ├── three_interpolate.cpp │ │ │ ├── three_nn.cpp │ │ │ ├── tin_shift.cpp │ │ │ ├── upfirdn2d.cpp │ │ │ └── voxelization.cpp │ │ ├── deform_conv.py │ │ ├── deform_roi_pool.py │ │ ├── deprecated_wrappers.py │ │ ├── diff_iou_rotated.py │ │ ├── filtered_lrelu.py │ │ ├── focal_loss.py │ │ ├── furthest_point_sample.py │ │ ├── fused_bias_leakyrelu.py │ │ ├── gather_points.py │ │ ├── group_points.py │ │ ├── info.py │ │ ├── iou3d.py │ │ ├── knn.py │ │ ├── masked_conv.py │ │ ├── merge_cells.py │ │ ├── min_area_polygons.py │ │ ├── modulated_deform_conv.py │ │ ├── multi_scale_deform_attn.py │ │ ├── nms.py │ │ ├── pixel_group.py │ │ ├── point_sample.py │ │ ├── points_in_boxes.py │ │ ├── points_in_polygons.py │ │ ├── points_sampler.py │ │ ├── prroi_pool.py │ │ ├── psa_mask.py │ │ ├── riroi_align_rotated.py │ │ ├── roi_align.py │ │ ├── roi_align_rotated.py │ │ ├── roi_pool.py │ │ ├── roiaware_pool3d.py │ │ ├── roipoint_pool3d.py │ │ ├── rotated_feature_align.py │ │ ├── saconv.py │ │ ├── scatter_points.py │ │ ├── sparse_conv.py │ │ ├── sparse_functional.py │ │ ├── sparse_modules.py │ │ ├── sparse_ops.py │ │ ├── sparse_pool.py │ │ ├── sparse_structure.py │ │ ├── sync_bn.py │ │ ├── three_interpolate.py │ │ ├── three_nn.py │ │ ├── tin_shift.py │ │ ├── upfirdn2d.py │ │ └── voxelize.py │ 
├── transforms/ │ │ ├── __init__.py │ │ ├── base.py │ │ ├── builder.py │ │ ├── formatting.py │ │ ├── loading.py │ │ ├── processing.py │ │ ├── utils.py │ │ └── wrappers.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── device_type.py │ │ ├── env.py │ │ ├── ext_loader.py │ │ └── parrots_jit.py │ ├── version.py │ ├── video/ │ │ ├── __init__.py │ │ ├── io.py │ │ ├── optflow.py │ │ └── processing.py │ └── visualization/ │ ├── __init__.py │ ├── color.py │ ├── image.py │ └── optflow.py ├── requirements/ │ ├── build.txt │ ├── docs.txt │ ├── optional.txt │ ├── runtime.txt │ └── test.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── tests/ ├── test_arraymisc.py ├── test_cnn/ │ ├── test_build_layers.py │ ├── test_context_block.py │ ├── test_conv2d_adaptive_padding.py │ ├── test_conv_module.py │ ├── test_depthwise_seperable_conv_module.py │ ├── test_flops_counter.py │ ├── test_fuse_conv_bn.py │ ├── test_generalized_attention.py │ ├── test_hsigmoid.py │ ├── test_hswish.py │ ├── test_non_local.py │ ├── test_rfsearch/ │ │ ├── test_operator.py │ │ └── test_search.py │ ├── test_scale.py │ ├── test_silu.py │ ├── test_swish.py │ ├── test_transformer.py │ └── test_wrappers.py ├── test_image/ │ ├── test_colorspace.py │ ├── test_geometric.py │ ├── test_image_misc.py │ ├── test_io.py │ └── test_photometric.py ├── test_ops/ │ ├── output.pkl │ ├── test_active_rotated_filter.py │ ├── test_assign_score_withk.py │ ├── test_ball_query.py │ ├── test_bbox.py │ ├── test_bezier_align.py │ ├── test_bias_act.py │ ├── test_bilinear_grid_sample.py │ ├── test_border_align.py │ ├── test_box_iou_quadri.py │ ├── test_box_iou_rotated.py │ ├── test_carafe.py │ ├── test_cc_attention.py │ ├── test_chamfer_distance.py │ ├── test_contour_expand.py │ ├── test_conv_gradfix.py │ ├── test_convex_iou.py │ ├── test_corner_pool.py │ ├── test_correlation.py │ ├── test_deform_conv.py │ ├── test_deform_roi_pool.py │ ├── test_diff_iou_rotated.py │ ├── test_filtered_lrelu.py │ ├── test_focal_loss.py │ ├── 
test_furthest_point_sample.py │ ├── test_fused_bias_leakyrelu.py │ ├── test_gather_points.py │ ├── test_group_points.py │ ├── test_info.py │ ├── test_iou3d.py │ ├── test_knn.py │ ├── test_masked_conv2d.py │ ├── test_merge_cells.py │ ├── test_min_area_polygons.py │ ├── test_modulated_deform_conv.py │ ├── test_ms_deformable_attn.py │ ├── test_nms.py │ ├── test_nms_quadri.py │ ├── test_nms_rotated.py │ ├── test_onnx.py │ ├── test_pixel_group.py │ ├── test_points_in_polygons.py │ ├── test_prroi_pool.py │ ├── test_psa_mask.py │ ├── test_riroi_align_rotated.py │ ├── test_roi_align.py │ ├── test_roi_align_rotated.py │ ├── test_roi_pool.py │ ├── test_roiaware_pool3d.py │ ├── test_roipoint_pool3d.py │ ├── test_rotated_feature_align.py │ ├── test_saconv.py │ ├── test_scatter_points.py │ ├── test_spconv.py │ ├── test_syncbn.py │ ├── test_three_interpolate.py │ ├── test_three_nn.py │ ├── test_tin_shift.py │ ├── test_upfirdn2d.py │ └── test_voxelization.py ├── test_transforms/ │ ├── test_transforms_formatting.py │ ├── test_transforms_loading.py │ ├── test_transforms_processing.py │ └── test_transforms_wrapper.py ├── test_utils/ │ ├── test_env.py │ └── test_parrots_jit.py ├── test_video/ │ ├── test_optflow.py │ ├── test_processing.py │ └── test_reader.py └── test_visualization.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dev_scripts/check_installation.py ================================================ import numpy as np import torch from mmcv.ops import box_iou_rotated from mmcv.utils import collect_env def check_installation(): """Check whether mmcv has been installed successfully.""" np_boxes1 = np.asarray( [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6], [7.0, 7.0, 8.0, 8.0, 0.4]], dtype=np.float32) np_boxes2 = np.asarray( [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5], [5.0, 5.0, 6.0, 7.0, 0.4]], dtype=np.float32) boxes1 = 
torch.from_numpy(np_boxes1) boxes2 = torch.from_numpy(np_boxes2) # test mmcv with CPU ops box_iou_rotated(boxes1, boxes2) print('CPU ops were compiled successfully.') # test mmcv with both CPU and CUDA ops if torch.cuda.is_available(): boxes1 = boxes1.cuda() boxes2 = boxes2.cuda() box_iou_rotated(boxes1, boxes2) print('CUDA ops were compiled successfully.') else: print('No CUDA runtime is found, skipping the checking of CUDA ops.') if __name__ == '__main__': print('Start checking the installation of mmcv ...') check_installation() print('mmcv has been installed successfully.\n') env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' print('Environment information:') print(dash_line + env_info + '\n' + dash_line) ================================================ FILE: .dockerignore ================================================ .git .gitignore *.egg-info .eggs/ .mypy-cache pip-wheel-metadata ================================================ FILE: .github/ISSUE_TEMPLATE/1-bug-report.yml ================================================ name: "🐞 Bug report" description: "Create a report to help us reproduce and fix the bug" labels: bug title: "[Bug] " body: - type: markdown attributes: value: | ## Note For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions) Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.** - type: checkboxes attributes: label: Prerequisite description: Please check the following items before creating a new issue. options: - label: I have searched [Issues](https://github.com/open-mmlab/mmcv/issues) and [Discussions](https://github.com/open-mmlab/mmcv/discussions) but cannot get the expected help. required: true - label: The bug has not been fixed in the latest version(https://github.com/open-mmlab/mmcv). 
required: true - type: textarea attributes: label: Environment description: | Please run `python -c "from mmcv.utils import collect_env; print(collect_env())"` to collect necessary environment information and copy-paste it here. You may add additional information that may be helpful for locating the problem, such as - How you installed PyTorch \[e.g., pip, conda, source\] - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.) validations: required: true - type: textarea attributes: label: Reproduces the problem - code sample description: | Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. placeholder: | ```python # Sample code to reproduce the problem ``` validations: required: true - type: textarea attributes: label: Reproduces the problem - command or script description: | What command or script did you run? placeholder: | ```shell The command or script you run. ``` validations: required: true - type: textarea attributes: label: Reproduces the problem - error message description: | Please provide the error message or logs you got, with the full traceback. Tip: You can attach images or log files by dragging them into the text area.. placeholder: | ``` The error message or logs you got, with the full traceback. ``` validations: required: true - type: textarea attributes: label: Additional information description: | Tell us anything else you think we should know. Tip: You can attach images or log files by dragging them into the text area. placeholder: | 1. What's your expected result? 2. What dataset did you use? 3. What do you think might be the reason? - type: markdown attributes: value: | ## Acknowledgement Thanks for taking the time to fill out this report. If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**Here**](https://github.com/open-mmlab/mmcv/pulls)! 
Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing. Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬 ================================================ FILE: .github/ISSUE_TEMPLATE/2-feature_request.yml ================================================ name: 🚀 Feature request description: Suggest an idea for this project labels: [feature-request] title: "[Feature] " body: - type: markdown attributes: value: | ## Note For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions) Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.** - type: textarea attributes: label: What is the feature? description: Tell us more about the feature and how this feature can help. placeholder: | E.g., It is inconvenient when \[....\]. validations: required: true - type: textarea attributes: label: Any other context? description: | Have you considered any alternative solutions or features? If so, what are they? Also, feel free to add any other context or screenshots about the feature request here. - type: markdown attributes: value: | ## Acknowledgement Thanks for taking the time to fill out this report. We strongly appreciate you creating a new PR to implement it [**Here**](https://github.com/open-mmlab/mmcv/pulls)! Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing. Welcome to join our [**Community (TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬 ================================================ FILE: .github/ISSUE_TEMPLATE/3-documentation.yml ================================================ name: 📚 Documentation description: Report an issue related to the documentation. 
labels: "docs" title: "[Docs] " body: - type: markdown attributes: value: | ## Note For general usage questions or idea discussions, please post it to our [**Forum**](https://github.com/open-mmlab/mmcv/discussions) Please fill in as **much** of the following form as you're able to. **The clearer the description, the shorter it will take to solve it.** - type: textarea attributes: label: 📚 The doc issue description: > A clear and concise description of the issue. validations: required: true - type: textarea attributes: label: Suggest a potential alternative/fix description: > Tell us how we could improve the documentation in this regard. - type: markdown attributes: value: | ## Acknowledgement Thanks for taking the time to fill out this report. If you have already identified the reason, we strongly appreciate you creating a new PR to fix it [**here**](https://github.com/open-mmlab/mmcv/pulls)! Please refer to [**Contribution Guide**](https://mmcv.readthedocs.io/en/latest/community/contributing.html) for contributing. Welcome to join our [**Community(TODO)**](https://mmcv.readthedocs.io/en/latest/contact.html) to discuss together. 👬 ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: 💬 Forum url: https://github.com/open-mmlab/mmcv/discussions about: Ask general usage questions and discuss with other mmcv community members - name: MMCV Documentation url: https://mmcv.readthedocs.io/en/latest/ about: Check if your question is answered in docs - name: 🌐 Explore OpenMMLab url: https://openmmlab.com/ about: Get to know more about OpenMMLab ================================================ FILE: .github/pull_request_template.md ================================================ Thanks for your contribution and we appreciate it a lot. The following instructions will make your pull request healthier and help it get feedback more easily.
If you do not understand some items, don't worry, just make the pull request and seek help from maintainers. ## Motivation Please describe the motivation of this PR and the goal you want to achieve through this PR. ## Modification Please briefly describe what modification is made in this PR. ## BC-breaking (Optional) Does the modification introduce changes that break the backward-compatibility of the downstream repositories? If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR. ## Use cases (Optional) If this PR introduces a new feature, it is better to list some use cases here, and update the documentation. ## Checklist **Before PR**: - [ ] I have read and followed the workflow indicated in the [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) to create this PR. - [ ] Pre-commit or linting tools indicated in [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) are used to fix the potential lint issues. - [ ] Bug fixes are covered by unit tests, the case that causes the bug should be added in the unit tests. - [ ] New functionalities are covered by complete unit tests. If not, please add more unit test to ensure the correctness. - [ ] The documentation has been modified accordingly, including docstring or example tutorials. **After PR**: - [ ] If the modification has potential influence on downstream or other related projects, this PR should be tested with some of those projects, like MMDet or MMCls. - [ ] CLA has been signed and all committers have signed the CLA in this PR. 
================================================ FILE: .github/workflows/build_macos_wheel.yml ================================================ name: build macos wheel on: push concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: build_macos10_wheel: runs-on: macos-latest if: contains(github.event.head_commit.message, 'Bump version to') strategy: matrix: torch: [1.8.0, 1.9.0, 1.10.0, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0] python-version: [3.7, 3.8, 3.9, '3.10', '3.11'] include: - torch: 1.8.0 torchvision: 0.9.0 - torch: 1.9.0 torchvision: 0.10.0 - torch: 1.10.0 torchvision: 0.11.0 - torch: 1.11.0 torchvision: 0.12.0 - torch: 1.12.0 torchvision: 0.13.0 - torch: 1.13.0 torchvision: 0.14.0 - torch: 2.0.0 torchvision: 0.15.1 - torch: 2.1.0 torchvision: 0.16.0 exclude: - torch: 1.8.0 python-version: '3.10' - torch: 1.9.0 python-version: '3.10' - torch: 1.10.0 python-version: '3.10' - torch: 1.8.0 python-version: '3.11' - torch: 1.9.0 python-version: '3.11' - torch: 1.10.0 python-version: '3.11' - torch: 1.10.0 python-version: '3.11' - torch: 1.11.0 python-version: '3.11' - torch: 1.12.0 python-version: '3.11' - torch: 1.13.0 python-version: '3.11' - torch: 2.0.0 python-version: 3.7 - torch: 2.1.0 python-version: 3.7 steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install psutil run: pip install psutil - name: Install PyTorch run: pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} --no-cache-dir - name: Build and install run: | pip install wheel python setup.py bdist_wheel - uses: actions/upload-artifact@v3 with: name: ${{matrix.torch}} path: dist/ ================================================ FILE: .github/workflows/lint.yml ================================================ name: lint on: [push, pull_request] concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: lint: 
runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 - name: Set up Python 3.10.15 uses: actions/setup-python@v2 with: python-version: '3.10.15' - name: Install pre-commit hook run: | pip install pre-commit pre-commit install - name: Linting run: pre-commit run --all-files - name: Format c/cuda codes with clang-format uses: DoozyX/clang-format-lint-action@v0.18 with: source: mmcv/ops/csrc extensions: h,c,cpp,hpp,cu,cuh style: google ================================================ FILE: .github/workflows/merge_stage_test.yml ================================================ name: merge_stage_test on: push: paths-ignore: - ".github/**.md" - "docker/**" - "docs/**" - 'examples/**' - '.dev_scripts/**' - "README.md" - "README_zh-CN.md" - "CONTRIBUTING.md" - ".pre-commit-config.yaml" - ".pre-commit-config-zh-cn.yaml" branches: - main concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: build_without_torch: runs-on: ubuntu-22.04 env: MMCV_WITH_OPS: 0 strategy: matrix: python-version: [3.7] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests/test_image tests/test_transforms tests/test_video tests/test_arraymisc.py tests/test_visualization.py tests/test_utils/test_env.py --ignore=tests/test_image/test_io.py build_without_ops: runs-on: ubuntu-22.04 env: MMCV_WITH_OPS: 0 strategy: matrix: python-version: [3.7] torch: [1.8.1, 1.9.1] include: - torch: 1.8.1 torchvision: 0.9.1 - torch: 1.9.1 torchvision: 0.10.1 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg - name: Install PyTorch run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests --ignore=tests/test_ops build_cpu_py: runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.8, 3.9, '3.10'] torch: [1.13.0] include: - torch: 1.13.0 torchvision: 0.14.0 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg - name: Upgrade pip and wheel run: pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests and generate coverage report run: | coverage run --branch --source mmcv -m pytest tests/ coverage xml coverage report -m build_cpu_pt: runs-on: ubuntu-22.04 strategy: matrix: python-version: [3.7] torch: [1.8.1, 1.9.1, 1.10.1, 1.11.0, 1.12.0, 1.13.0, 2.0.0, 2.1.0] include: - torch: 1.8.1 torchvision: 0.9.1 - torch: 1.9.1 torchvision: 0.10.1 - torch: 1.10.1 torchvision: 0.11.2 - torch: 1.11.0 torchvision: 0.12.0 - torch: 1.12.0 torchvision: 0.13.0 - torch: 1.13.0 torchvision: 0.14.0 - torch: 2.0.0 torchvision: 0.15.1 python-version: 3.8 - torch: 2.1.0 torchvision: 0.16.0 python-version: 3.8 exclude: - torch: 2.0.0 python-version: 3.7 - torch: 2.1.0 python-version: 3.7 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg - name: Upgrade pip and wheel run: pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests and generate coverage report run: | coverage run --branch --source mmcv -m pytest tests/ coverage xml coverage report -m # Only upload coverage report for python3.7 && pytorch1.8.1 cpu - name: Upload coverage to Codecov if: ${{matrix.torch == '1.8.1' && matrix.python-version == '3.8'}} uses: codecov/codecov-action@v1.0.14 with: file: ./coverage.xml flags: unittests env_vars: OS,PYTHON name: codecov-umbrella fail_ci_if_error: false build_cu102: runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.8.1-cuda10.2-cudnn7-devel env: FORCE_CUDA: 1 MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 strategy: matrix: python-version: [3.7] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: pip install pip wheel --upgrade - name: Fetch GPG keys run: | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - name: Install system dependencies run: apt-get update && apt-get install -y git ffmpeg libturbojpeg - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests and generate coverage report run: | coverage run --branch --source mmcv -m pytest tests/ coverage xml coverage report -m build_cu111: runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel env: FORCE_CUDA: 1 MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 strategy: matrix: python-version: [3.7] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: pip install pip wheel --upgrade - name: Fetch GPG keys run: | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - name: Install system dependencies run: apt-get update && apt-get install -y git ffmpeg libturbojpeg - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests and generate coverage report run: | coverage run --branch --source mmcv -m pytest tests/ coverage xml coverage report -m build_cu116: runs-on: ubuntu-22.04 container: image: pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel env: FORCE_CUDA: 1 MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61 strategy: matrix: python-version: [3.7] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: pip install pip wheel --upgrade - name: Fetch GPG keys run: | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - name: Install system dependencies run: apt-get update && apt-get install -y git ffmpeg libturbojpeg - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests and generate coverage report run: | coverage run --branch --source mmcv -m pytest tests coverage xml coverage report -m build_windows_without_ops: runs-on: windows-2019 env: MMCV_WITH_OPS: 0 strategy: matrix: python-version: [3.7] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: python -m pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . -v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py build_windows: runs-on: windows-2019 strategy: matrix: torch: [1.8.1, 2.1.0] include: - torch: 1.8.1 torchvision: 0.9.1 python-version: 3.7 - torch: 2.1.0 torchvision: 0.16.0 python-version: 3.8 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: python -m pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests/ --ignore tests/test_image/test_io.py build_macos: runs-on: macos-latest strategy: matrix: torch: [1.8.1, 2.1.0] include: - torch: 1.8.1 torchvision: 0.9.1 python-version: 3.7 - torch: 2.1.0 torchvision: 0.16.0 python-version: 3.8 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install system dependencies run: brew install ffmpeg jpeg-turbo - name: Upgrade pip and wheel run: pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }} - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests/ ================================================ FILE: .github/workflows/pr_stage_test.yml ================================================ name: pr_stage_test on: pull_request: paths-ignore: - ".github/**.md" - "docker/**" - "docs/**" - 'examples/**' - '.dev_scripts/**' - "README.md" - "README_zh-CN.md" - "CONTRIBUTING.md" - ".pre-commit-config.yaml" - ".pre-commit-config-zh-cn.yaml" concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: build_cu121: runs-on: ubuntu-22.04 container: image: nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 strategy: matrix: python-version: ["3.10"] torch: ["2.5.1"] steps: - uses: actions/checkout@v3 - name: Install basic tools run: | apt-get update apt-get install -y wget build-essential git software-properties-common # 安装特定版本的 Python - name: Install Python ${{ matrix.python-version }} run: | add-apt-repository ppa:deadsnakes/ppa -y apt-get update apt-get install -y python${{ matrix.python-version }} python${{ matrix.python-version }}-dev python${{ matrix.python-version }}-distutils update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${{ matrix.python-version }} 1 update-alternatives --install /usr/bin/python python /usr/bin/python${{ matrix.python-version }} 1 wget https://bootstrap.pypa.io/get-pip.py python get-pip.py # 安装 PyTorch - name: Install PyTorch run: | pip install torch==${{ matrix.torch }} torchvision --index-url https://download.pytorch.org/whl/cu121 - name: Fetch GPG keys run: | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - name: Install system dependencies run: apt-get update && apt-get install -y git ffmpeg libturbojpeg - name: Install MMEngine from 
main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMEngine from source run: pip install -e . -v - name: Install unit tests dependencies run: | pip install -r requirements/test.txt # Distributed related unit test may randomly error in PyTorch 1.13.0 - name: Run unittests and generate coverage report run: | coverage run --branch --source mmengine -m pytest tests/ --ignore tests/test_dist/ coverage xml coverage report -m build_cu118: runs-on: ubuntu-22.04 container: image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 strategy: matrix: python-version: ["3.10"] torch: ["2.0.0", "2.1.0","2.3.1"] steps: - uses: actions/checkout@v3 - name: Install basic tools run: | apt-get update apt-get install -y wget build-essential git software-properties-common # 安装特定版本的 Python - name: Install Python ${{ matrix.python-version }} run: | add-apt-repository ppa:deadsnakes/ppa -y apt-get update apt-get install -y python${{ matrix.python-version }} python${{ matrix.python-version }}-dev python${{ matrix.python-version }}-distutils update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${{ matrix.python-version }} 1 update-alternatives --install /usr/bin/python python /usr/bin/python${{ matrix.python-version }} 1 wget https://bootstrap.pypa.io/get-pip.py python get-pip.py # 安装 PyTorch - name: Install PyTorch run: | pip install torch==${{ matrix.torch }} torchvision --index-url https://download.pytorch.org/whl/cu118 - name: Fetch GPG keys run: | apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - name: Install system dependencies run: apt-get update && apt-get install -y git ffmpeg libturbojpeg - name: Install MMEngine from main branch run: pip install 
git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMEngine from source run: pip install -e . -v - name: Install unit tests dependencies run: | pip install -r requirements/test.txt pip install numpy==1.24.3 # Distributed related unit test may randomly error in PyTorch 1.13.0 - name: Run unittests and generate coverage report run: | coverage run --branch --source mmengine -m pytest tests/ --ignore tests/test_dist/ coverage xml coverage report -m build_windows_without_ops: runs-on: windows-2019 env: MMCV_WITH_OPS: 0 strategy: matrix: python-version: [3.7] steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: python -m pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==1.8.1+cpu torchvision==0.9.1+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests --ignore=tests/test_ops --ignore tests/test_image/test_io.py build_windows: runs-on: windows-2019 strategy: matrix: torch: [1.8.1, 2.1.0] include: - torch: 1.8.1 torchvision: 0.9.1 python-version: 3.7 - torch: 2.1.0 torchvision: 0.16.0 python-version: 3.8 steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Upgrade pip and wheel run: python -m pip install pip wheel --upgrade - name: Install PyTorch run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - name: Install MMEngine from main branch run: pip install git+https://github.com/open-mmlab/mmengine.git@main - name: Install ninja to speed the compilation run: pip install ninja psutil - name: Build MMCV from source run: pip install -e . 
-v - name: Install unit tests dependencies run: pip install -r requirements/test.txt - name: Run unit tests run: pytest tests/ --ignore tests/test_image/test_io.py # build_macos: # runs-on: macos-latest # strategy: # matrix: # torch: [1.8.1, 2.1.0] # include: # - torch: 1.8.1 # torchvision: 0.9.1 # python-version: 3.7.1 # - torch: 2.1.0 # torchvision: 0.16.0 # python-version: 3.8.1 # steps: # - uses: actions/checkout@v2 # - name: Set up Python ${{ matrix.python-version }} # uses: actions/setup-python@v2 # with: # python-version: ${{ matrix.python-version }} # - name: Install system dependencies # run: brew install ffmpeg jpeg-turbo # - name: Upgrade pip and wheel # run: pip install pip wheel --upgrade # - name: Install PyTorch # run: pip install torch==${{ matrix.torch }} torchvision==${{ matrix.torchvision }} # - name: Install MMEngine from main branch # run: pip install git+https://github.com/open-mmlab/mmengine.git@main # - name: Install ninja to speed the compilation # run: pip install ninja psutil # - name: Build MMCV from source # run: pip install -e . 
-v # - name: Install unit tests dependencies # run: pip install -r requirements/test.txt # - name: Run unit tests # run: pytest tests/ ================================================ FILE: .github/workflows/publish-to-pypi.yml ================================================ name: deploy on: push concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: build-n-publish_without_ops: runs-on: ubuntu-22.04 if: startsWith(github.event.ref, 'refs/tags') steps: - uses: actions/checkout@v2 - name: Set up Python 3.7 uses: actions/setup-python@v1 with: python-version: 3.7 - name: Upgrade Setuptools run: pip install setuptools wheel --upgrade - name: Build MMCV run: | sed -i "s/os.getenv('MMCV_WITH_OPS', '1')/os.getenv('MMCV_WITH_OPS', '0')/g" setup.py python setup.py sdist bdist_wheel - name: Publish distribution to PyPI run: | pip install twine twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} build-n-publish_with_ops: runs-on: ubuntu-22.04 if: startsWith(github.event.ref, 'refs/tags') steps: - uses: actions/checkout@v2 - name: Set up Python 3.7 uses: actions/setup-python@v1 with: python-version: 3.7 - name: Upgrade Setuptools run: pip install setuptools --upgrade - name: Build MMCV with ops run: python setup.py sdist - name: Publish distribution to PyPI run: | pip install twine twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }} ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # PyTorch checkpoint *.pth # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST mlu-ops/ mlu-ops.* # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/en/_build/ docs/en/api/generated/ docs/zh_cn/_build/ docs/zh_cn/api/generated/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # editors and IDEs .idea/ .vscode/ # custom .DS_Store # datasets and logs and checkpoints data/ work_dir/ src/ ================================================ FILE: .pre-commit-config-zh-cn.yaml ================================================ exclude: ^tests/data/ repos: - repo: https://github.com/pre-commit/pre-commit rev: v4.0.0 hooks: - id: validate_manifest - repo: https://github.com/PyCQA/flake8 rev: 7.1.1 hooks: - id: flake8 - repo: https://gitee.com/openmmlab/mirrors-isort rev: 5.11.5 hooks: - id: isort - repo: https://gitee.com/openmmlab/mirrors-yapf rev: v0.32.0 hooks: - id: yapf - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks rev: v5.0.0 hooks: - id: trailing-whitespace - id: check-yaml - id: end-of-file-fixer - id: requirements-txt-fixer - id: double-quote-string-fixer - id: check-merge-conflict - id: fix-encoding-pragma args: ["--remove"] - id: mixed-line-ending args: ["--fix=lf"] - repo: https://gitee.com/openmmlab/mirrors-codespell rev: v2.2.1 hooks: - id: codespell - repo: https://gitee.com/openmmlab/mirrors-mdformat rev: 0.7.9 hooks: - id: mdformat args: ["--number"] additional_dependencies: - mdformat-openmmlab - 
mdformat_frontmatter - linkify-it-py - repo: https://gitee.com/openmmlab/mirrors-docformatter # TODO:https://github.com/PyCQA/docformatter/issues/289 rev: v1.3.1 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] - repo: https://github.com/asottile/pyupgrade rev: v3.0.0 hooks: - id: pyupgrade args: ["--py36-plus"] - repo: https://gitee.com/openmmlab/pre-commit-hooks rev: v0.2.0 # Use the ref you want to point at hooks: - id: check-copyright args: ["mmcv", "tests", "--excludes", "mmcv/ops"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.2.0 hooks: - id: mypy exclude: |- (?x)( ^test | ^docs ) additional_dependencies: ["types-setuptools", "types-requests"] # - repo: local # hooks: # - id: clang-format # name: clang-format # description: Format files with ClangFormat # entry: clang-format -style=google -i # language: system # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ ================================================ FILE: .pre-commit-config.yaml ================================================ exclude: ^tests/data/ repos: - repo: https://github.com/pre-commit/pre-commit rev: v4.0.0 hooks: - id: validate_manifest - repo: https://github.com/PyCQA/flake8 rev: 7.1.1 hooks: - id: flake8 - repo: https://github.com/PyCQA/isort rev: 5.11.5 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf rev: v0.32.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: trailing-whitespace - id: check-yaml - id: end-of-file-fixer - id: requirements-txt-fixer - id: double-quote-string-fixer - id: check-merge-conflict - id: fix-encoding-pragma args: ["--remove"] - id: mixed-line-ending args: ["--fix=lf"] - repo: https://github.com/codespell-project/codespell rev: v2.2.1 hooks: - id: codespell - repo: https://github.com/executablebooks/mdformat rev: 0.7.9 hooks: - id: mdformat args: ["--number"] additional_dependencies: - mdformat-openmmlab - mdformat_frontmatter - linkify-it-py - repo: 
https://github.com/myint/docformatter rev: 06907d0 hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] - repo: https://github.com/asottile/pyupgrade rev: v3.0.0 hooks: - id: pyupgrade args: ["--py36-plus"] - repo: https://github.com/open-mmlab/pre-commit-hooks rev: v0.2.0 # Use the ref you want to point at hooks: - id: check-copyright args: ["mmcv", "tests", "--excludes", "mmcv/ops"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.2.0 hooks: - id: mypy exclude: |- (?x)( ^test | ^docs ) additional_dependencies: ["types-setuptools", "types-requests"] # - repo: local # hooks: # - id: clang-format # name: clang-format # description: Format files with ClangFormat # entry: clang-format -style=google -i # language: system # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ ================================================ FILE: .readthedocs.yml ================================================ version: 2 formats: all build: os: ubuntu-22.04 tools: python: "3.7" python: install: - requirements: requirements/runtime.txt - requirements: requirements/docs.txt ================================================ FILE: CITATION.cff ================================================ cff-version: 1.2.0 message: "If you use this software, please cite it as below." authors: - name: "MMCV Contributors" title: "OpenMMLab Computer Vision Foundation" date-released: 2018-08-22 url: "https://github.com/open-mmlab/mmcv" license: Apache-2.0 ================================================ FILE: CONTRIBUTING.md ================================================ ## Contributing to OpenMMLab Welcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to **Fix bug** You can directly post a Pull Request to fix typo in code or documents The steps to fix the bug of code implementation are as follows. 1. 
If the modification involves significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose a proper solution. 2. Post a pull request after fixing the bug and adding the corresponding unit test. **New Feature or Enhancement** 1. If the modification involves significant changes, you should create an issue to discuss with our developers to propose a proper design. 2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test. **Document** You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable. ### Pull Request Workflow If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) #### 1. Fork and clone If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile. Then, you can clone the repositories to local: ```shell git clone git@github.com:{username}/mmcv.git ``` After that, you should add the official repository as the upstream repository ```bash git remote add upstream git@github.com:open-mmlab/mmcv ``` Check whether remote repository has been added successfully by `git remote -v` ```bash origin git@github.com:{username}/mmcv.git (fetch) origin git@github.com:{username}/mmcv.git (push) upstream git@github.com:open-mmlab/mmcv (fetch) upstream git@github.com:open-mmlab/mmcv (push) ``` > Here's a brief introduction to origin and upstream.
When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in official("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically. #### 2. Configure pre-commit You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory. ```shell pip install -U pre-commit pre-commit install ``` Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`. ```shell pre-commit run --all-files ``` If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation. If the code does not conform to the code style specification, pre-commit will raise a warning and fixes some of the errors automatically. If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option(**only for temporarily commit**). ```shell git commit -m "xxx" --no-verify ``` #### 3. Create a development branch After configuring the pre-commit, we should create a branch based on the master branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name` ```shell git checkout -b yhc/refactor_contributing_doc ``` In subsequent development, if the master branch of the local repository is behind the master branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command: ```shell git pull upstream master ``` #### 4. 
Commit the code and pass the unit test - MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html). - The committed code should pass through the unit test ```shell # Pass all unit tests pytest tests # Pass the unit test of runner pytest tests/test_runner/test_runner.py ``` If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test) - If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering) #### 5. Push the code to remote We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option. ```shell git push -u origin {branch_name} ``` This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository. #### 6. Create a Pull Request (1) Create a pull request in GitHub's Pull request interface (2) Modify the PR description according to the guidelines so that other developers can better understand your changes Find more details about Pull Request description in [pull request guidelines](#pr-specs). 
**note** (a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) (b) If it is your first contribution, please sign the CLA (c) Check whether the Pull Request passes the CI MMCV will run unit test for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. (3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP. #### 7. Resolve conflicts If your local branch conflicts with the latest master branch of "upstream", you'll need to resolve them. There are two ways to do this: ```shell git fetch --all --prune git rebase upstream/master ``` or ```shell git fetch --all --prune git merge upstream/master ``` If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts.
### Guidance #### Unit test If you cannot run the unit test of some modules for lacking of some dependencies, such as [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) module, you can try to install the following dependencies: ```shell # Linux sudo apt-get update -y sudo apt-get install -y libturbojpeg sudo apt-get install -y ffmpeg # Windows conda install ffmpeg ``` We should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test: ```shell python -m coverage run -m pytest /path/to/test_file python -m coverage html # check file in htmlcov/index.html ``` #### Document rendering If the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results: ```shell pip install -r requirements/docs.txt cd docs/zh_cn/ # or docs/en make html # check file in ./docs/zh_cn/_build/html/index.html ``` ### Code style #### Python We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. We use the following tools for linting and formatting: - [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. - [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. - [yapf](https://github.com/google/yapf): A formatter for Python files. - [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. - [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). 
We use a [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit. The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml). #### C++ and CUDA We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). ### PR Specs 1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style 2. One short-time branch should be matched with only one PR 3. Accomplish a detailed change in one PR. Avoid large PR - Bad: Support Faster R-CNN - Acceptable: Add a box head to Faster R-CNN - Good: Add a parameter to box head to support custom conv-layer number 4. Provide clear and significant commit message 5. Provide clear and meaningful PR description - Task name should be clarified in title. The general format is: \[Prefix\] Short description of the PR (Suffix) - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) - Introduce main changes, results and influences on other modules in short description - Associate related issues and pull requests with a milestone ================================================ FILE: CONTRIBUTING_zh-CN.md ================================================ ## 贡献代码 欢迎加入 MMCV 社区,我们致力于打造最前沿的计算机视觉基础库,我们欢迎任何类型的贡献,包括但不限于 **修复错误** 修复代码实现错误的步骤如下: 1. 如果提交的代码改动较大,建议先提交 issue,并正确描述 issue 的现象、原因和复现方式,讨论后确认修复方案。 2. 修复错误并补充相应的单元测试,提交拉取请求。 **新增功能或组件** 1. 如果新功能或模块涉及较大的代码改动,建议先提交 issue,确认功能的必要性。 2. 实现新增功能并添加单元测试,提交拉取请求。 **文档补充** 修复文档可以直接提交拉取请求 添加文档或将文档翻译成其他语言步骤如下 1. 提交 issue,确认添加文档的必要性。 2.
添加文档,提交拉取请求。 ### 拉取请求工作流 如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 github [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) #### 1. 复刻仓库 当你第一次提交拉取请求时,先复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮,复刻后的代码库将会出现在你的 GitHub 个人主页下。 将代码克隆到本地 ```shell git clone git@github.com:{username}/mmcv.git ``` 添加原代码库为上游代码库 ```bash git remote add upstream git@github.com:open-mmlab/mmcv ``` 检查 remote 是否添加成功,在终端输入 `git remote -v` ```bash origin git@github.com:{username}/mmcv.git (fetch) origin git@github.com:{username}/mmcv.git (push) upstream git@github.com:open-mmlab/mmcv (fetch) upstream git@github.com:open-mmlab/mmcv (push) ``` > 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 git clone 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢他叫 upstream,也可以自己修改,比如叫 open-mmlab。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 #### 2. 配置 pre-commit 在本地开发环境中,我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格,以确保代码风格的统一。在提交代码,需要先安装 pre-commit(需要在 MMCV 目录下执行): ```shell pip install -U pre-commit pre-commit install ``` 检查 pre-commit 是否配置成功,并安装 `.pre-commit-config.yaml` 中的钩子: ```shell pre-commit run --all-files ``` > 如果你是中国用户,由于网络原因,可能会出现安装失败的情况,这时可以使用国内源 > pre-commit install -c .pre-commit-config-zh-cn.yaml > pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml 如果安装过程被中断,可以重复执行 `pre-commit run ...` 继续安装。 如果提交的代码不符合代码风格规范,pre-commit 会发出警告,并自动修复部分错误。 如果我们想临时绕开 pre-commit 的检查提交一次代码,可以在 `git commit` 时加上 `--no-verify`(需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查)。 ```shell git commit -m "xxx" --no-verify ``` #### 3. 创建开发分支 安装完 pre-commit 之后,我们需要基于 master 创建开发分支,建议的分支命名规则为 `username/pr_name`。 ```shell git checkout -b yhc/refactor_contributing_doc ``` 在后续的开发中,如果本地仓库的 master 分支落后于 upstream 的 master 分支,我们需要先拉取 upstream 的代码进行同步,再执行上面的命令 ```shell git pull upstream master ``` #### 4. 
提交代码并在本地通过单元测试 - MMCV 引入了 mypy 来做静态类型检查,以增加代码的鲁棒性。因此我们在提交代码时,需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。 - 提交的代码同样需要通过单元测试 ```shell # 通过全量单元测试 pytest tests # 我们需要保证提交的代码能够通过修改模块的单元测试,以 runner 为例 pytest tests/test_runner/test_runner.py ``` 如果你由于缺少依赖无法运行修改模块的单元测试,可以参考[指引-单元测试](#单元测试) - 如果修改/添加了文档,参考[指引](#文档渲染)确认文档渲染正常。 #### 5. 推送代码到远程 代码通过单元测试和 pre-commit 检查后,将代码推送到远程仓库,如果是第一次推送,可以在 `git push` 后加上 `-u` 参数以关联远程分支 ```shell git push -u origin {branch_name} ``` 这样下次就可以直接使用 `git push` 命令推送代码了,而无需指定分支和远程仓库。 #### 6. 提交拉取请求(PR) (1) 在 GitHub 的 Pull request 界面创建拉取请求 (2) 根据指引修改 PR 描述,以便于其他开发者更好地理解你的修改 描述规范详见[拉取请求规范](#拉取请求规范)   **注意事项** (a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 Issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) (b) 如果是第一次为 OpenMMLab 做贡献,需要签署 CLA (c) 检查提交的 PR 是否通过 CI(集成测试) MMCV 会在不同的平台(Linux、Window、Mac),基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试,以保证代码的正确性,如果有任何一个没有通过,我们可点击上图中的 `Details` 来查看具体的测试信息,以便于我们修改代码。 (3) 如果 PR 通过了 CI,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复 [4](#4-提交代码并本地通过单元测试)-[5](#5-推送代码到远程) 步骤,直到 reviewer 同意合入 PR。 所有 reviewer 同意合入 PR 后,我们会尽快将 PR 合并到主分支。 #### 7. 
解决冲突 随着时间的推移,我们的代码库会不断更新,这时候,如果你的 PR 与主分支存在冲突,你需要解决冲突,解决冲突的方式有两种: ```shell git fetch --all --prune git rebase upstream/master ``` 或者 ```shell git fetch --all --prune git merge upstream/master ``` 如果你非常善于处理冲突,那么可以使用 rebase 的方式来解决冲突,因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用,那么可以使用 `merge` 的方式来解决冲突。 ### 指引 #### 单元测试 如果你无法正常执行部分模块的单元测试,例如 [video](https://github.com/open-mmlab/mmcv/tree/master/mmcv/video) 模块,可能是你的当前环境没有安装以下依赖 ```shell # Linux sudo apt-get update -y sudo apt-get install -y libturbojpeg sudo apt-get install -y ffmpeg # Windows conda install ffmpeg ``` 在提交修复代码错误或新增特性的拉取请求时,我们应该尽可能的让单元测试覆盖所有提交的代码,计算单元测试覆盖率的方法如下 ```shell python -m coverage run -m pytest /path/to/test_file python -m coverage html # check file in htmlcov/index.html ``` #### 文档渲染 在提交修复代码错误或新增特性的拉取请求时,可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。 本地生成渲染后的文档的方法如下 ```shell pip install -r requirements/docs.txt cd docs/zh_cn/ # or docs/en make html # check file in ./docs/zh_cn/_build/html/index.html ``` ### 代码风格 #### Python [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范,我们使用以下工具检查和格式化代码 - [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具,是多个检查工具的封装 - [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具 - [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 - [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误 - [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 - [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具 yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到 通过配置 [pre-commit hook](https://pre-commit.com/) ,我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`, 修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,调整 `requirments.txt` 的包顺序。 pre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 找到。 pre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。 更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。 #### C++ and 
CUDA C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) ### 拉取请求规范 1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 2. 一个`拉取请求`对应一个短期分支 3. 粒度要细,一个`拉取请求`只做一件事情,避免超大的`拉取请求` - Bad:实现 Faster R-CNN - Acceptable:给 Faster R-CNN 添加一个 box head - Good:给 box head 增加一个参数来支持自定义的 conv 层数 4. 每次 Commit 时需要提供清晰且有意义 commit 信息 5. 提供清晰且有意义的`拉取请求`描述 - 标题写明白任务名称,一般格式:\[Prefix\] Short description of the pull request (Suffix) - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被review) - 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 - 关联相关的`议题` (issue) 和其他`拉取请求` 6. 如果引入了其他三方库,或借鉴了三方库的代码,请确认他们的许可证和 mmcv 兼容,并在借鉴的代码上补充 `This code is inspired from http://` ================================================ FILE: LICENSE ================================================ Copyright (c) OpenMMLab. All rights reserved Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2018-2020 Open-MMLab. All rights reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: LICENSES.md ================================================ # Licenses for special operations In this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters. | Operation | Files | License | | :--------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | | upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License | | fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License | | bias_act | [mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/bias_act_cuda.cu) | NVIDIA License | | filtered_lrelu | [mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/csrc/pytorch/cuda/filtered_lrelu.cu) | NVIDIA License | | conv2d_gradfix | [mmcv/ops/conv2d_gradfix.py](https://github.com/open-mmlab/mmcv/tree/2.x/mmcv/ops/conv2d_gradfix.py) | NVIDIA License | ================================================ FILE: MANIFEST.in 
================================================ include requirements/runtime.txt include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm ================================================ FILE: README.md ================================================
 
OpenMMLab website HOT      OpenMMLab platform TRY IT OUT
 
[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/) [![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) [📘Documentation](https://mmcv.readthedocs.io/en/latest/) | [🛠️Installation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) | [🤔Reporting Issues](https://github.com/open-mmlab/mmcv/issues/new/choose)
English | [简体中文](README_zh-CN.md)
## Highlights The OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process. MMCV v2.0.0 official version was released on April 6, 2023. In version 2.x, it removed components related to the training process and added a data transformation module. Also, starting from 2.x, it renamed the package names **mmcv** to **mmcv-lite** and **mmcv-full** to **mmcv**. For details, see [Compatibility Documentation](docs/en/compatibility.md). MMCV will maintain both [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (corresponding to the original [master](https://github.com/open-mmlab/mmcv/tree/master) branch) and **2.x** (corresponding to the **main** branch, now the default branch) versions simultaneously. For details, see [Branch Maintenance Plan](README.md#branch-maintenance-plan). ## Introduction MMCV is a foundational library for computer vision research and it provides the following functionalities: - [Image/Video processing](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_process.html) - [Image and annotation visualization](https://mmcv.readthedocs.io/en/latest/understand_mmcv/visualization.html) - [Image transformation](https://mmcv.readthedocs.io/en/latest/understand_mmcv/data_transform.html) - [Various CNN architectures](https://mmcv.readthedocs.io/en/latest/understand_mmcv/cnn.html) - [High-quality implementation of common CPU and CUDA ops](https://mmcv.readthedocs.io/en/latest/understand_mmcv/ops.html) It supports the following systems: - Linux - Windows - macOS See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage. Note: MMCV requires Python 3.7+. 
## Installation There are two versions of MMCV: - **mmcv**: comprehensive, with full features and various CUDA ops out of the box. It takes longer to build. - **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops. **Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. ### Install mmcv Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). For Apple silicon users, please use PyTorch 1.13+. The command to install mmcv: ```bash pip install -U openmim mim install mmcv ``` If you need to specify the version of mmcv, you can use the following command: ```bash mim install mmcv==2.0.0 ``` If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html).
Installation log using pre-built packages Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
Collecting mmcv
Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl
Installation log using source packages Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
Collecting mmcv==2.0.0
Downloading mmcv-2.0.0.tar.gz
For more installation methods, please refer to the [Installation documentation](https://mmcv.readthedocs.io/en/latest/get_started/installation.html). ### Install mmcv-lite If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). ```bash pip install -U openmim mim install mmcv-lite ``` ## FAQ If you face installation issues, CUDA-related issues, or RuntimeErrors, you may first refer to the [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html) to see if there is a solution. If the problem is still not solved, feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). ## Citation If you find this project useful in your research, please consider citing it: ```latex @misc{mmcv, title={{MMCV: OpenMMLab} Computer Vision Foundation}, author={MMCV Contributors}, howpublished = {\url{https://github.com/open-mmlab/mmcv}}, year={2018} } ``` ## Contributing We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guidelines. ## License MMCV is released under the Apache 2.0 license, while some specific operations in this library are released under other licenses. Please check [LICENSES.md](LICENSES.md) carefully if you are using our code for commercial purposes. ## Branch Maintenance Plan MMCV currently has four branches, namely main, 1.x, master, and 2.x, where 2.x is an alias for the main branch, and master is an alias for the 1.x branch. The 2.x and master branches will be deleted in the future. 
MMCV's branches go through the following three stages: | Phase | Time | Branch | description | | -------------------- | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | | RC Period | 2022.9.1 - 2023.4.5 | Release candidate code (2.x version) will be released on 2.x branch. Default master branch is still 1.x version | Master and 2.x branches iterate normally | | Compatibility Period | 2023.4.6 - 2023.12.31 | **The 2.x branch has been renamed to the main branch and set as the default branch**, and 1.x branch will correspond to 1.x version | We still maintain the old version 1.x, respond to user needs, but try not to introduce changes that break compatibility; main branch iterates normally | | Maintenance Period | From 2024/1/1 | Default main branch corresponds to 2.x version and 1.x branch is 1.x version | 1.x branch is in maintenance phase, no more new feature support; main branch is iterating normally | ## Projects in OpenMMLab - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. - [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. 
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. - [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. - [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. - [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. - [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. ================================================ FILE: README_zh-CN.md ================================================
 
OpenMMLab 官网 HOT      OpenMMLab 开放平台 TRY IT OUT
 
[![platform](https://img.shields.io/badge/platform-Linux%7CWindows%7CmacOS-blue)](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![pytorch](https://img.shields.io/badge/pytorch-1.8~2.0-orange)](https://pytorch.org/get-started/previous-versions/) [![cuda](https://img.shields.io/badge/cuda-10.1~11.8-green)](https://developer.nvidia.com/cuda-downloads) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) [📘使用文档](https://mmcv.readthedocs.io/zh_CN/latest/) | [🛠️安装教程](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html) | [🤔报告问题](https://github.com/open-mmlab/mmcv/issues/new/choose)
[English](README.md) | 简体中文
## Highlights OpenMMLab 团队于 2022 年 9 月 1 日在世界人工智能大会发布了新一代训练引擎 [MMEngine](https://github.com/open-mmlab/mmengine),它是一个用于训练深度学习模型的基础库。相比于 MMCV,它提供了更高级且通用的训练器、接口更加统一的开放架构以及可定制化程度更高的训练流程。 MMCV v2.0.0 正式版本于 2023 年 4 月 6 日发布。在 2.x 版本中,它删除了和训练流程相关的组件,并新增了数据变换模块。另外,从 2.x 版本开始,重命名包名 **mmcv** 为 **mmcv-lite** 以及 **mmcv-full** 为 **mmcv**。详情见[兼容性文档](docs/zh_cn/compatibility.md)。 MMCV 会同时维护 [1.x](https://github.com/open-mmlab/mmcv/tree/1.x) (对应原 [master](https://github.com/open-mmlab/mmcv/tree/master) 分支) 和 **2.x**(对应 **main** 分支,现为默认分支)版本,详情见[分支维护计划](README_zh-CN.md#分支维护计划)。 ## 简介 MMCV 是一个面向计算机视觉的基础库,它提供了以下功能: - [图像和视频处理](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_process.html) - [图像和标注结果可视化](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/visualization.html) - [图像变换](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/data_transform.html) - [多种 CNN 网络结构](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/cnn.html) - [高质量实现的常见 CUDA 算子](https://mmcv.readthedocs.io/zh_CN/latest/understand_mmcv/ops.html) MMCV 支持多种平台,包括: - Linux - Windows - macOS 如想了解更多特性和使用,请参考[文档](http://mmcv.readthedocs.io/zh_CN/latest)。 提示: MMCV 需要 Python 3.7 以上版本。 ## 安装 MMCV 有两个版本: - **mmcv**: 完整版,包含所有的特性以及丰富的开箱即用的 CUDA 算子。注意完整版本可能需要更长时间来编译。 - **mmcv-lite**: 精简版,不包含 CUDA 算子但包含其余所有特性和功能,类似 MMCV 1.0 之前的版本。如果你不需要使用 CUDA 算子的话,精简版可以作为一个考虑选项。 **注意**: 请不要在同一个环境中安装两个版本,否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前,需要先卸载另一个。`如果 CUDA 可用,强烈推荐安装 mmcv`。 ### 安装 mmcv 在安装 mmcv 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。如果你使用的是搭载 apple silicon 的 mac 设备,请安装 PyTorch 1.13+ 的版本。 安装 mmcv 的命令如下: ```bash pip install -U openmim mim install mmcv ``` 如果需要指定 mmcv 的版本,可以使用以下命令 ```bash mim install mmcv==2.0.0 ``` 如果发现上述的安装命令没有使用预编译包(以 `.whl` 结尾)而是使用源码包(以 `.tar.gz` 结尾)安装,则有可能是我们没有提供和当前环境的 PyTorch 版本、CUDA 版本相匹配的 mmcv 预编译包,此时,你可以[源码安装 mmcv](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。
使用预编译包的安装日志 Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
Collecting mmcv
Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl
使用源码包的安装日志 Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
Collecting mmcv==2.0.0
Downloading mmcv-2.0.0.tar.gz
更多安装方式请参考[安装文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/installation.html)。 ### 安装 mmcv-lite 如果你需要使用和 PyTorch 相关的模块,请确保 PyTorch 已经成功安装在环境中,可以参考 [PyTorch 官方安装文档](https://github.com/pytorch/pytorch#installation)。 ```bash pip install -U openmim mim install mmcv-lite ``` ## FAQ 如果你遇到了安装问题或者运行时问题,请查看[问题解决页面](https://mmcv.readthedocs.io/zh_CN/latest/faq.html)是否已有解决方案。如果问题仍然没有解决,欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。 ## 贡献指南 我们感谢所有的贡献者为改进和提升 MMCV 所作出的努力。请参考[贡献指南](CONTRIBUTING.md)来了解参与项目贡献的相关指引。 ## 许可证 `MMCV` 目前以 Apache 2.0 的许可证发布,但是其中有一部分功能并不是使用的 Apache2.0 许可证,我们在 [许可证](LICENSES.md) 中详细地列出了这些功能以及他们对应的许可证,如果您正在从事盈利性活动,请谨慎参考此文档。 ## 分支维护计划 MMCV 目前有四个分支,分别是 main、1.x、master 和 2.x,其中 2.x 为 main 分支的别名,master 为 1.x 分支的别名,2.x 和 master 这两个分支在将来会被删除。MMCV 的分支经历以下三个阶段: | 阶段 | 时间 | 分支 | 说明 | | ------ | --------------------- | --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | 公测期 | 2022.9.1 - 2023.4.5 | 公测版代码发布在 2.x 分支;默认主分支 master 仍对应 1.x 版本 | master 和 2.x 分支正常进行迭代 | | 兼容期 | 2023.4.6 - 2023.12.31 | **2.x 分支重命名为 main 分支并设置为默认分支**;1.x 分支对应 1.x 版本 | 保持对旧版本 1.x 的维护和开发,响应用户需求,但尽量不引进破坏旧版本兼容性的改动;main 分支正常进行迭代 | | 维护期 | 2024.1.1 - 待定 | 默认主分支 main 为 2.x 版本;1.x 分支对应 1.x 版本 | 1.x 分支进入维护阶段,不再进行新功能支持;main 分支正常进行迭代 | ## OpenMMLab 的其他项目 - [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 - [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 - [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 
系列工具箱与测试基准 - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱 - [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 - [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 - [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 - [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 ## 欢迎加入 OpenMMLab 社区 扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=K0QI8ByU),或添加微信小助手“OpenMMLabwx”加入官方交流微信群。
我们会在 OpenMMLab 社区为大家 - 📢 分享 AI 框架的前沿核心技术 - 💻 解读 PyTorch 常用模块源码 - 📰 发布 OpenMMLab 的相关新闻 - 🚀 介绍 OpenMMLab 开发的前沿算法 - 🏃 获取更高效的问题答疑和意见反馈 - 🔥 提供与各行各业开发者充分交流的平台 干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬 ================================================ FILE: TERMINOLOGY.md ================================================ # English-Chinese terminology comparison (英汉术语对照) This document is used as a reference for English-Chinese terminology translation. 该文档用作中英文翻译对照参考。 | English | 中文 | | :---------------: | :----------: | | annotation | 标注 | | backbone | 主干网络 | | benchmark | 基准测试 | | checkpoint | 模型权重文件 | | classifier | 分类器 | | cls_head | 分类头 | | decoder | 解码器 | | detector | 检测器 | | encoder | 编码器 | | finetune | 微调 | | ground truth | 真实标签 | | hook | 钩子 | | localizer | 定位器 | | neck | 模型颈部 | | pipeline | 流水线 | | recognizer | 识别器 | | register | 注册器 | | schedule | 调整 | | scheduler | 调度器 | | segmentor | 分割器 | | tensor | 张量 | | training schedule | 训练策略 | ================================================ FILE: docker/README.md ================================================ # Docker images There are two `Dockerfile` files to build docker images, one to build an image with the mmcv pre-built package and the other with the mmcv development environment. ```text . |-- README.md |-- dev # build with mmcv development environment | `-- Dockerfile `-- release # build with mmcv pre-built package `-- Dockerfile ``` ## Build docker images ### Build with mmcv pre-built package Build with local repository ```bash git clone https://github.com/open-mmlab/mmcv.git && cd mmcv docker build -t mmcv -f docker/release/Dockerfile . ``` Or build with remote repository ```bash docker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release ``` The [Dockerfile](release/Dockerfile) installs latest released version of mmcv by default, but you can specify mmcv versions to install expected versions. ```bash docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0rc1 . 
``` If you also want to use other versions of PyTorch and CUDA, you can pass them when building docker images. The following example builds an image with PyTorch 1.9.0 and CUDA 11.1: ```bash docker build -t mmcv -f docker/release/Dockerfile \ --build-arg PYTORCH=1.9.0 \ --build-arg CUDA=11.1 \ --build-arg CUDNN=8 \ --build-arg MMCV=2.0.0rc1 . ``` More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags). ### Build with mmcv development environment If you want to build a docker image with the mmcv development environment, you can use the following command: ```bash git clone https://github.com/open-mmlab/mmcv.git && cd mmcv docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 . ``` Note that `CUDA_ARCH` is the compute capability of your GPU and you can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute). The building process may take 10 minutes or more. ## Run images ```bash docker run --gpus all --shm-size=8g -it mmcv ``` See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usage options. 
================================================ FILE: docker/dev/Dockerfile ================================================ ARG PYTORCH="1.8.1" ARG CUDA="10.2" ARG CUDNN="7" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel # To fix GPG key error when running apt-get update RUN rm /etc/apt/sources.list.d/cuda.list \ && rm /etc/apt/sources.list.d/nvidia-ml.list \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub # Install git and system dependencies for opencv-python RUN apt-get update && apt-get install -y git \ && apt-get update && apt-get install -y libgl1 libglib2.0-0 # Install system dependencies for unit tests RUN apt-get install -y ffmpeg libturbojpeg \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* # build mmcv from source with develop mode ARG HTTPS_PROXY="" ENV https_proxy=${HTTPS_PROXY} ENV FORCE_CUDA="1" ARG CUDA_ARCH="" ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH} RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv WORKDIR /mmcv RUN git checkout 2.x && git rev-parse --short HEAD RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install ================================================ FILE: docker/release/Dockerfile ================================================ ARG PYTORCH="1.8.1" ARG CUDA="10.2" ARG CUDNN="7" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel # To fix GPG key error when running apt-get update RUN rm /etc/apt/sources.list.d/cuda.list \ && rm /etc/apt/sources.list.d/nvidia-ml.list \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub # Install system dependencies for opencv-python RUN 
apt-get update && apt-get install -y libgl1 libglib2.0-0 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* # Install mmcv ARG MMCV="" RUN if [ "${MMCV}" = "" ]; then pip install -U openmim && mim install 'mmcv>=2.0.0rc1'; else pip install -U openmim && mim install mmcv==${MMCV}; fi # Verify the installation RUN python -c 'import mmcv;print(mmcv.__version__)' ================================================ FILE: docs/en/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/en/_static/css/readthedocs.css ================================================ .header-logo { background-image: url("../image/mmcv-logo.png"); background-size: 85px 40px; height: 40px; width: 85px; } table.colwidths-auto td { width: 50% } ================================================ FILE: docs/en/_static/version.json ================================================ { "Linux": [ { "cuda": "12.1", "torch": "2.4.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.4.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", 
"torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.5", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.0", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.2", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.9.x", 
"mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.2", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.2", "torch": "1.5.x", "mmcv": [ "2.0.0rc3" ] }, { "cuda": "10.1", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.1", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.1", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.1", "torch": "1.5.x", "mmcv": [ "2.0.0rc3" ] }, { "cuda": "9.2", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "9.2", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "9.2", "torch": "1.5.x", "mmcv": [ "2.0.0rc3", "2.0.0rc2" ] }, { "cuda": "cpu", "torch": "2.4.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "cpu", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": 
"cpu", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "1.5.x", "mmcv": [ "2.0.0rc3", "2.0.0rc2" ] } ], "Windows": [ { "cuda": "12.1", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.5", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", 
"torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3" ] }, { "cuda": "10.2", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.1", "torch": "1.8.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.1", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3" ] }, { "cuda": "10.1", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "cpu", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", 
"2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] } ], "macOS": [ { "cuda": "cpu", "torch": "2.1.x", "mmcv": [ "2.1.0" ] }, { "cuda": "cpu", "torch": "2.0.x", "mmcv": [ "2.1.0", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.13.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "mps", "torch": "1.13.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3" ] }, { "cuda": "cpu", "torch": "1.12.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.11.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.10.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.9.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.8.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2" ] }, { "cuda": "cpu", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2" ] } ] } ================================================ FILE: docs/en/_templates/classtemplate.rst ================================================ .. role:: hidden :class: hidden-section .. currentmodule:: {{ module }} {{ name | underline}} .. autoclass:: {{ name }} :members: .. autogenerated from source/_templates/classtemplate.rst note it does not have :inherited-members: ================================================ FILE: docs/en/api/arraymisc.rst ================================================ .. 
role:: hidden :class: hidden-section mmcv.arraymisc =================================== .. contents:: mmcv.arraymisc :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.arraymisc .. autosummary:: :toctree: generated :nosignatures: quantize dequantize ================================================ FILE: docs/en/api/cnn.rst ================================================ .. role:: hidden :class: hidden-section mmcv.cnn =================================== .. contents:: mmcv.cnn :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.cnn Module ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst ContextBlock Conv2d Conv3d ConvAWS2d ConvModule ConvTranspose2d ConvTranspose3d ConvWS2d DepthwiseSeparableConvModule GeneralizedAttention HSigmoid HSwish LayerScale Linear MaxPool2d MaxPool3d NonLocal1d NonLocal2d NonLocal3d Scale Swish Conv2dRFSearchOp Build Function ---------------- .. autosummary:: :toctree: generated :nosignatures: build_activation_layer build_conv_layer build_norm_layer build_padding_layer build_plugin_layer build_upsample_layer Miscellaneous ---------------- .. autosummary:: :toctree: generated :nosignatures: fuse_conv_bn conv_ws_2d is_norm make_res_layer make_vgg_layer get_model_complexity_info ================================================ FILE: docs/en/api/image.rst ================================================ .. role:: hidden :class: hidden-section mmcv.image =================================== .. contents:: mmcv.image :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.image IO ---------------- .. autosummary:: :toctree: generated :nosignatures: imfrombytes imread imwrite use_backend Color Space ---------------- .. autosummary:: :toctree: generated :nosignatures: bgr2gray bgr2hls bgr2hsv bgr2rgb bgr2ycbcr gray2bgr gray2rgb hls2bgr hsv2bgr imconvert rgb2bgr rgb2gray rgb2ycbcr ycbcr2bgr ycbcr2rgb Geometric ---------------- .. 
autosummary:: :toctree: generated :nosignatures: cutout imcrop imflip impad impad_to_multiple imrescale imresize imresize_like imresize_to_multiple imrotate imshear imtranslate rescale_size Photometric ---------------- .. autosummary:: :toctree: generated :nosignatures: adjust_brightness adjust_color adjust_contrast adjust_hue adjust_lighting adjust_sharpness auto_contrast clahe imdenormalize imequalize iminvert imnormalize lut_transform posterize solarize Miscellaneous ---------------- .. autosummary:: :toctree: generated :nosignatures: tensor2imgs ================================================ FILE: docs/en/api/ops.rst ================================================ .. role:: hidden :class: hidden-section mmcv.ops =================================== .. contents:: mmcv.ops :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.ops .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst BorderAlign CARAFE CARAFENaive CARAFEPack Conv2d ConvTranspose2d CornerPool Correlation CrissCrossAttention DeformConv2d DeformConv2dPack DeformRoIPool DeformRoIPoolPack DynamicScatter FusedBiasLeakyReLU GroupAll Linear MaskedConv2d MaxPool2d ModulatedDeformConv2d ModulatedDeformConv2dPack ModulatedDeformRoIPoolPack MultiScaleDeformableAttention PSAMask PointsSampler PrRoIPool QueryAndGroup RiRoIAlignRotated RoIAlign RoIAlignRotated RoIAwarePool3d RoIPointPool3d RoIPool SAConv2d SigmoidFocalLoss SimpleRoIAlign SoftmaxFocalLoss SparseConv2d SparseConv3d SparseConvTensor SparseConvTranspose2d SparseConvTranspose3d SparseInverseConv2d SparseInverseConv3d SparseMaxPool2d SparseMaxPool3d SparseModule SparseSequential SubMConv2d SubMConv3d SyncBatchNorm TINShift Voxelization .. 
autosummary:: :toctree: generated :nosignatures: active_rotated_filter assign_score_withk ball_query batched_nms bbox_overlaps border_align box_iou_rotated boxes_iou3d boxes_iou_bev boxes_overlap_bev carafe carafe_naive chamfer_distance contour_expand convex_giou convex_iou deform_conv2d deform_roi_pool diff_iou_rotated_2d diff_iou_rotated_3d dynamic_scatter furthest_point_sample furthest_point_sample_with_dist fused_bias_leakyrelu gather_points grouping_operation knn masked_conv2d min_area_polygons modulated_deform_conv2d nms nms3d nms3d_normal nms_bev nms_match nms_normal_bev nms_rotated pixel_group point_sample points_in_boxes_all points_in_boxes_cpu points_in_boxes_part points_in_polygons prroi_pool rel_roi_point_to_rel_img_point riroi_align_rotated roi_align roi_align_rotated roi_pool rotated_feature_align scatter_nd sigmoid_focal_loss soft_nms softmax_focal_loss three_interpolate three_nn tin_shift upfirdn2d voxelization ================================================ FILE: docs/en/api/transforms.rst ================================================ .. role:: hidden :class: hidden-section mmcv.transforms =================================== .. currentmodule:: mmcv.transforms .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst BaseTransform TestTimeAug Loading ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst LoadAnnotations LoadImageFromFile Processing ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst CenterCrop MultiScaleFlipAug Normalize Pad RandomChoiceResize RandomFlip RandomGrayscale RandomResize Resize ToTensor ImageToTensor Wrapper ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst Compose KeyMapper RandomApply RandomChoice TransformBroadcaster ================================================ FILE: docs/en/api/utils.rst ================================================ .. 
role:: hidden :class: hidden-section mmcv.utils =================================== .. contents:: mmcv.utils :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.utils .. autosummary:: :toctree: generated :nosignatures: IS_CUDA_AVAILABLE IS_MLU_AVAILABLE IS_MPS_AVAILABLE collect_env jit skip_no_elena ================================================ FILE: docs/en/api/video.rst ================================================ .. role:: hidden :class: hidden-section mmcv.video =================================== .. contents:: mmcv.video :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.video IO ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst VideoReader Cache .. autosummary:: :toctree: generated :nosignatures: frames2video Optical Flow ---------------- .. autosummary:: :toctree: generated :nosignatures: dequantize_flow flow_from_bytes flow_warp flowread flowwrite quantize_flow sparse_flow_from_bytes Video Processing ---------------- .. autosummary:: :toctree: generated :nosignatures: concat_video convert_video cut_video resize_video ================================================ FILE: docs/en/api/visualization.rst ================================================ .. role:: hidden :class: hidden-section mmcv.visualization =================================== .. contents:: mmcv.visualization :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.visualization Color ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst Color .. autosummary:: :toctree: generated :nosignatures: color_val Image ---------------- .. autosummary:: :toctree: generated :nosignatures: imshow imshow_bboxes imshow_det_bboxes Optical Flow ---------------- .. 
autosummary:: :toctree: generated :nosignatures: flow2rgb flowshow make_color_wheel ================================================ FILE: docs/en/community/contributing.md ================================================ ## Contributing to OpenMMLab Welcome to the MMCV community, we are committed to building a cutting-edge computer vision foundational library and all kinds of contributions are welcomed, including but not limited to **Fix bug** You can directly post a Pull Request to fix typos in code or documents. The steps to fix the bug of code implementation are as follows. 1. If the modification involves significant changes, you should create an issue first and describe the error information and how to trigger the bug. Other developers will discuss with you and propose a proper solution. 2. Post a pull request after fixing the bug and adding corresponding unit test. **New Feature or Enhancement** 1. If the modification involves significant changes, you should create an issue to discuss with our developers to propose a proper design. 2. Post a Pull Request after implementing the new feature or enhancement and add corresponding unit test. **Document** You can directly post a pull request to fix documents. If you want to add a document, you should first create an issue to check if it is reasonable. ### Pull Request Workflow If you're not familiar with Pull Request, don't worry! The following guidance will tell you how to create a Pull Request step by step. If you want to dive into the develop mode of Pull Request, you can refer to the [official documents](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) #### 1. Fork and clone If you are posting a pull request for the first time, you should fork the OpenMMLab repositories by clicking the **Fork** button in the top right corner of the GitHub page, and the forked repositories will appear under your GitHub profile.
Then, you can clone the repositories to local: ```shell git clone git@github.com:{username}/mmcv.git ``` After that, you should add the official repository as the upstream repository ```bash git remote add upstream git@github.com:open-mmlab/mmcv ``` Check whether the remote repository has been added successfully by `git remote -v` ```bash origin git@github.com:{username}/mmcv.git (fetch) origin git@github.com:{username}/mmcv.git (push) upstream git@github.com:open-mmlab/mmcv (fetch) upstream git@github.com:open-mmlab/mmcv (push) ``` ```{note} Here's a brief introduction to origin and upstream. When we use "git clone", we create an "origin" remote by default, which points to the repository cloned from. As for "upstream", we add it ourselves to point to the target repository. Of course, if you don't like the name "upstream", you could name it as you wish. Usually, we'll push the code to "origin". If the pushed code conflicts with the latest code in the official repository ("upstream"), we should pull the latest code from upstream to resolve the conflicts, and then push to "origin" again. The posted Pull Request will be updated automatically. ``` #### 2. Configure pre-commit You should configure [pre-commit](https://pre-commit.com/#intro) in the local development environment to make sure the code style matches that of OpenMMLab. **Note**: The following code should be executed under the MMCV directory. ```shell pip install -U pre-commit pre-commit install ``` Check that pre-commit is configured successfully, and install the hooks defined in `.pre-commit-config.yaml`. ```shell pre-commit run --all-files ``` ```{note} Chinese users may fail to download the pre-commit hooks due to the network issue.
In this case, you could download these hooks from gitee by setting the .pre-commit-config-zh-cn.yaml pre-commit install -c .pre-commit-config-zh-cn.yaml pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml ``` If the installation process is interrupted, you can repeatedly run `pre-commit run ... ` to continue the installation. If the code does not conform to the code style specification, pre-commit will raise a warning and fix some of the errors automatically. If we want to commit our code bypassing the pre-commit hook, we can use the `--no-verify` option (**only for temporary commits**). ```shell git commit -m "xxx" --no-verify ``` #### 3. Create a development branch After configuring the pre-commit, we should create a branch based on the main branch to develop the new feature or fix the bug. The proposed branch name is `username/pr_name` ```shell git checkout -b yhc/refactor_contributing_doc ``` In subsequent development, if the main branch of the local repository is behind the main branch of "upstream", we need to pull the upstream for synchronization, and then execute the above command: ```shell git pull upstream main ``` #### 4. Commit the code and pass the unit test - MMCV introduces mypy to do static type checking to increase the robustness of the code. Therefore, we need to add Type Hints to our code and pass the mypy check. If you are not familiar with Type Hints, you can refer to [this tutorial](https://docs.python.org/3/library/typing.html). - The committed code should pass through the unit test ```shell # Pass all unit tests pytest tests # Pass the unit test of runner pytest tests/test_runner/test_runner.py ``` If the unit test fails for lack of dependencies, you can install the dependencies referring to the [guidance](#unit-test) - If the documents are modified/added, we should check the rendering result referring to [guidance](#document-rendering) #### 5.
Push the code to remote We could push the local commits to remote after passing through the check of unit test and pre-commit. You can associate the local branch with remote branch by adding `-u` option. ```shell git push -u origin {branch_name} ``` This will allow you to use the `git push` command to push code directly next time, without having to specify a branch or the remote repository. #### 6. Create a Pull Request (1) Create a pull request in GitHub's Pull request interface (2) Modify the PR description according to the guidelines so that other developers can better understand your changes Find more details about Pull Request description in [pull request guidelines](#pr-specs). **note** (a) The Pull Request description should contain the reason for the change, the content of the change, and the impact of the change, and be associated with the relevant Issue (see [documentation](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) (b) If it is your first contribution, please sign the CLA (c) Check whether the Pull Request passes the CI MMCV will run unit test for the posted Pull Request on different platforms (Linux, Windows, Mac), based on different versions of Python, PyTorch, CUDA to make sure the code is correct. We can see the specific test information by clicking `Details` in the above image so that we can modify the code. (3) If the Pull Request passes the CI, then you can wait for the review from other developers. You'll modify the code based on the reviewer's comments, and repeat the steps [4](#4-commit-the-code-and-pass-the-unit-test)-[5](#5-push-the-code-to-remote) until all reviewers approve it. Then, we will merge it ASAP. #### 7. Resolve conflicts If your local branch conflicts with the latest main branch of "upstream", you'll need to resolve them.
There are two ways to do this: ```shell git fetch --all --prune git rebase upstream/main ``` or ```shell git fetch --all --prune git merge upstream/main ``` If you are very good at handling conflicts, then you can use rebase to resolve conflicts, as this will keep your commit logs tidy. If you are not familiar with `rebase`, then you can use `merge` to resolve conflicts. ### Guidance #### Unit test If you cannot run the unit test of some modules for lacking of some dependencies, such as [video](https://github.com/open-mmlab/mmcv/tree/main/mmcv/video) module, you can try to install the following dependencies: ```shell # Linux sudo apt-get update -y sudo apt-get install -y libturbojpeg sudo apt-get install -y ffmpeg # Windows conda install ffmpeg ``` We should also make sure the committed code will not decrease the coverage of unit test, we could run the following command to check the coverage of unit test: ```shell python -m coverage run -m pytest /path/to/test_file python -m coverage html # check file in htmlcov/index.html ``` #### Document rendering If the documents are modified/added, we should check the rendering result. We could install the dependencies and run the following command to render the documents and check the results: ```shell pip install -r requirements/docs.txt cd docs/zh_cn/ # or docs/en make html # check file in ./docs/zh_cn/_build/html/index.html ``` ### Code style #### Python We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style. We use the following tools for linting and formatting: - [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. - [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. - [yapf](https://github.com/google/yapf): A formatter for Python files. - [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. 
- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit. The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml). #### C++ and CUDA We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). ### PR Specs 1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style 2. One short-time branch should be matched with only one PR 3. Accomplish a detailed change in one PR. Avoid large PR - Bad: Support Faster R-CNN - Acceptable: Add a box head to Faster R-CNN - Good: Add a parameter to box head to support custom conv-layer number 4. Provide clear and significant commit message 5. Provide clear and meaningful PR description - Task name should be clarified in title. The general format is: \[Prefix\] Short description of the PR (Suffix) - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) - Introduce main changes, results and influences on other modules in short description - Associate related issues and pull requests with a milestone ================================================ FILE: docs/en/community/pr.md ================================================ ## Pull Request (PR) Content has been migrated to [contributing guidance](contributing.md).
================================================ FILE: docs/en/compatibility.md ================================================ ### v2.0.0 The OpenMMLab team released a new generation of training engine [MMEngine](https://github.com/open-mmlab/mmengine) at the World Artificial Intelligence Conference on September 1, 2022. It is a foundational library for training deep learning models. Compared with MMCV, it provides a universal and powerful runner, an open architecture with a more unified interface, and a more customizable training process. The OpenMMLab team released MMCV v2.0.0 on April 6, 2023. In the 2.x version, it has the following significant changes: (1) It removed the following components: - `mmcv.fileio` module, removed in PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179). FileIO module from mmengine will be used wherever required. - `mmcv.runner`, `mmcv.parallel`, `mmcv.engine` and `mmcv.device`, removed in PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216). - All classes in `mmcv.utils` (e.g. `Config` and `Registry`) and many functions, removed in PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217). Only a few functions related to mmcv are reserved. - `mmcv.onnx`, `mmcv.tensorrt` modules and related functions, removed in PR [#2225](https://github.com/open-mmlab/mmcv/pull/2225). - Removed all root registries in MMCV and registered classes or functions to the [root registry](https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py) in MMEngine. (2) It added the [`mmcv.transforms`](https://github.com/open-mmlab/mmcv/tree/main/mmcv/transforms) data transformation module. (3) It renamed the package name **mmcv** to **mmcv-lite** and **mmcv-full** to **mmcv** in PR [#2235](https://github.com/open-mmlab/mmcv/pull/2235). It also changed the default value of the environment variable `MMCV_WITH_OPS` from 0 to 1.
MMCV < 2.0 MMCV >= 2.0
```bash # Contains ops, because the highest version of mmcv-full is less than 2.0.0, so there is no need to add version restrictions pip install openmim mim install mmcv-full # do not contain ops pip install openmim mim install "mmcv < 2.0.0" ``` ```bash # Contains ops pip install openmim mim install mmcv # Ops are not included, because the starting version of mmcv-lite is 2.0.0rc1, so there is no need to add version restrictions pip install openmim mim install mmcv-lite ```
### v1.3.18 Some ops have different implementations on different devices. Lots of macros and type checks are scattered in several files, which makes the code hard to maintain. For example: ```c++ if (input.device().is_cuda()) { #ifdef MMCV_WITH_CUDA CHECK_CUDA_INPUT(input); CHECK_CUDA_INPUT(rois); CHECK_CUDA_INPUT(output); CHECK_CUDA_INPUT(argmax_y); CHECK_CUDA_INPUT(argmax_x); roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); #else AT_ERROR("RoIAlign is not compiled with GPU support"); #endif } else { CHECK_CPU_INPUT(input); CHECK_CPU_INPUT(rois); CHECK_CPU_INPUT(output); CHECK_CPU_INPUT(argmax_y); CHECK_CPU_INPUT(argmax_x); roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } ``` Registry and dispatcher are added to manage these implementations. ```c++ void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned); void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { ROIAlignForwardCUDAKernelLauncher( input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } // register cuda implementation void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned); REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda); // roi_align.cpp // use the dispatcher to invoke different implementation depending on device type of input tensors. 
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } ``` ### v1.3.11 In order to flexibly support more backends and hardwares like `NVIDIA GPUs` and `AMD GPUs`, the directory of `mmcv/ops/csrc` is refactored. Note that this refactoring will not affect the usage in API. For related information, please refer to [PR1206](https://github.com/open-mmlab/mmcv/pull/1206). The original directory was organized as follows. ``` . ├── common_cuda_helper.hpp ├── ops_cuda_kernel.cuh ├── pytorch_cpp_helper.hpp ├── pytorch_cuda_helper.hpp ├── parrots_cpp_helper.hpp ├── parrots_cuda_helper.hpp ├── parrots_cudawarpfunction.cuh ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h │   ├── ort_mmcv_utils.h │   ├── ... │   ├── onnx_ops.h │   └── cpu │ ├── onnxruntime_register.cpp │      ├── ... │      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_cuda.cu │   ├── ops_parrots.cpp │   └── ops_pytorch.h ├── pytorch │   ├── ... │   ├── ops.cpp │   ├── ops_cuda.cu │   ├── pybind.cpp └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp ├── trt_plugin.hpp ├── trt_serialize.hpp ├── ... ├── trt_ops.hpp └── plugins    ├── trt_cuda_helper.cu    ├── trt_plugin.cpp    ├── ...    ├── trt_ops.cpp    └── trt_ops_kernel.cu ``` After refactored, it is organized as follows. ``` . ├── common │ ├── box_iou_rotated_utils.hpp │ ├── parrots_cpp_helper.hpp │ ├── parrots_cuda_helper.hpp │ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cuda_helper.hpp │   └── cuda │   ├── common_cuda_helper.hpp │   ├── parrots_cudawarpfunction.cuh │   ├── ... 
│   └── ops_cuda_kernel.cuh ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h │   ├── ort_mmcv_utils.h │   ├── ... │   ├── onnx_ops.h │   └── cpu │ ├── onnxruntime_register.cpp │      ├── ... │      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_parrots.cpp │   └── ops_pytorch.h ├── pytorch │   ├── info.cpp │   ├── pybind.cpp │   ├── ... │   ├── ops.cpp │   └── cuda │      ├── ... │      └── ops_cuda.cu └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp ├── trt_plugin.hpp ├── trt_serialize.hpp ├── ... ├── trt_ops.hpp └── plugins    ├── trt_cuda_helper.cu    ├── trt_plugin.cpp    ├── ...    ├── trt_ops.cpp    └── trt_ops_kernel.cu ``` ================================================ FILE: docs/en/conf.py ================================================ # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# import os import sys import pytorch_sphinx_theme from sphinx.builders.html import StandaloneHTMLBuilder sys.path.insert(0, os.path.abspath('../..')) version_file = '../../mmcv/version.py' with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' copyright = '2018-2022, OpenMMLab' author = 'MMCV Authors' # The short X.Y version version = __version__ # The full version, including alpha/beta/rc tags release = __version__ # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx_markdown_tables', 'myst_parser', 'sphinx_copybutton', ] # yapf: disable myst_heading_anchors = 4 myst_enable_extensions = ['colon_fence'] # Configuration for intersphinx intersphinx_mapping = { 'python': ('https://docs.python.org/3', None), 'numpy': ('https://numpy.org/doc/stable', None), 'torch': ('https://pytorch.org/docs/stable/', None), 'mmengine': ('https://mmengine.readthedocs.io/en/latest', None), } autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = { '.rst': 'restructuredtext', '.md': 'markdown', } # The master toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
# # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # # html_theme = 'sphinx_rtd_theme' html_theme = 'pytorch_sphinx_theme' html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = { 'menu': [ { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, ], # Specify the language of shared menu 'menu_lang': 'en', } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] html_css_files = ['css/readthedocs.css'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. 
htmlhelp_basename = 'mmcvdoc' # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, 'mmcv.tex', 'mmcv Documentation', 'MMCV Contributors', 'manual'), ] # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [(master_doc, 'mmcv', 'mmcv Documentation', [author], 1)] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'mmcv', 'mmcv Documentation', author, 'mmcv', 'One line description of project.', 'Miscellaneous'), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] # set priority when building html StandaloneHTMLBuilder.supported_image_types = [ 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg' ] # -- Extension configuration ------------------------------------------------- # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. 
' copybutton_prompt_is_regexp = True ================================================ FILE: docs/en/deployment/mmcv_ops_definition.md ================================================ # MMCV Operators To make custom operators in MMCV more standard, precise definitions of each operator are listed in this document. - [MMCV Operators](#mmcv-operators) - [MMCVBorderAlign](#mmcvborderalign) - [Description](#description) - [Parameters](#parameters) - [Inputs](#inputs) - [Outputs](#outputs) - [Type Constraints](#type-constraints) - [MMCVCARAFE](#mmcvcarafe) - [Description](#description-1) - [Parameters](#parameters-1) - [Inputs](#inputs-1) - [Outputs](#outputs-1) - [Type Constraints](#type-constraints-1) - [MMCVCAWeight](#mmcvcaweight) - [Description](#description-2) - [Parameters](#parameters-2) - [Inputs](#inputs-2) - [Outputs](#outputs-2) - [Type Constraints](#type-constraints-2) - [MMCVCAMap](#mmcvcamap) - [Description](#description-3) - [Parameters](#parameters-3) - [Inputs](#inputs-3) - [Outputs](#outputs-3) - [Type Constraints](#type-constraints-3) - [MMCVCornerPool](#mmcvcornerpool) - [Description](#description-4) - [Parameters](#parameters-4) - [Inputs](#inputs-4) - [Outputs](#outputs-4) - [Type Constraints](#type-constraints-4) - [MMCVDeformConv2d](#mmcvdeformconv2d) - [Description](#description-5) - [Parameters](#parameters-5) - [Inputs](#inputs-5) - [Outputs](#outputs-5) - [Type Constraints](#type-constraints-5) - [MMCVModulatedDeformConv2d](#mmcvmodulateddeformconv2d) - [Description](#description-6) - [Parameters](#parameters-6) - [Inputs](#inputs-6) - [Outputs](#outputs-6) - [Type Constraints](#type-constraints-6) - [MMCVDeformRoIPool](#mmcvdeformroipool) - [Description](#description-7) - [Parameters](#parameters-7) - [Inputs](#inputs-7) - [Outputs](#outputs-7) - [Type Constraints](#type-constraints-7) - [MMCVMaskedConv2d](#mmcvmaskedconv2d) - [Description](#description-8) - [Parameters](#parameters-8) - [Inputs](#inputs-8) - [Outputs](#outputs-8) - [Type 
Constraints](#type-constraints-8) - [MMCVPSAMask](#mmcvpsamask) - [Description](#description-9) - [Parameters](#parameters-9) - [Inputs](#inputs-9) - [Outputs](#outputs-9) - [Type Constraints](#type-constraints-9) - [NonMaxSuppression](#nonmaxsuppression) - [Description](#description-10) - [Parameters](#parameters-10) - [Inputs](#inputs-10) - [Outputs](#outputs-10) - [Type Constraints](#type-constraints-10) - [MMCVRoIAlign](#mmcvroialign) - [Description](#description-11) - [Parameters](#parameters-11) - [Inputs](#inputs-11) - [Outputs](#outputs-11) - [Type Constraints](#type-constraints-11) - [MMCVRoIAlignRotated](#mmcvroialignrotated) - [Description](#description-12) - [Parameters](#parameters-12) - [Inputs](#inputs-12) - [Outputs](#outputs-12) - [Type Constraints](#type-constraints-12) - [grid_sampler\*](#grid_sampler) - [Description](#description-13) - [Parameters](#parameters-13) - [Inputs](#inputs-13) - [Outputs](#outputs-13) - [Type Constraints](#type-constraints-13) - [cummax\*](#cummax) - [Description](#description-14) - [Parameters](#parameters-14) - [Inputs](#inputs-14) - [Outputs](#outputs-14) - [Type Constraints](#type-constraints-14) - [cummin\*](#cummin) - [Description](#description-15) - [Parameters](#parameters-15) - [Inputs](#inputs-15) - [Outputs](#outputs-15) - [Type Constraints](#type-constraints-15) - [Reminders](#reminders) ## MMCVBorderAlign ### Description Applies `border_align` over the input feature based on predicted bboxes. For each border line (e.g. top, left, bottom or right) of each box, border_align does the following: - uniformly samples `pool_size`+1 positions on this line, involving the start and end points. - the corresponding features on these points are computed by bilinear interpolation. - max pooling over all the `pool_size`+1 positions is used for computing the pooled feature. Read [BorderDet: Border Feature for Dense Object Detection](https://arxiv.org/abs/2007.11056) for more detailed information.
### Parameters | Type | Parameter | Description | | ----- | ----------- | ----------------------------------------------------------------------------------- | | `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). | ### Inputs
input: T
Features with shape [N,4C,H,W]. Channels ranged in [0,C), [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, right features respectively
boxes: T
Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
### Outputs
output: T
Pooled features with shape [N,C,H*W,4]. The order is (top, left, bottom, right) for the last dimension.
### Type Constraints - T:tensor(float32) ## MMCVCARAFE ### Description CARAFE operator performs feature upsampling. Read [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.02188) for more detailed information. ### Parameters | Type | Parameter | Description | | ------- | -------------- | --------------------------------------------- | | `int` | `kernel_size` | reassemble kernel size, should be odd integer | | `int` | `group_size` | reassemble group size | | `float` | `scale_factor` | upsample ratio(>=1) | ### Inputs
features: T
Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.
masks: T
The input mask
### Outputs
output: T
The upsampled features. 4-D tensor of shape (N, C, H * scale_factor, W * scale_factor). N is the batch size.
### Type Constraints - T:tensor(float32) ## MMCVCAWeight ### Description Operator for Criss-Cross Attention. Read [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/pdf/1811.11721.pdf) for more detailed information. ### Parameters None ### Inputs
t: T
The query matrix of shape (N, C', H, W).
f: T
The key matrix of shape (N, C', H, W).
### Outputs
weight: T
The attention map of shape (N, H+W-1, H, W).
### Type Constraints - T:tensor(float32) ## MMCVCAMap ### Description Operator for Criss-Cross Attention. Read [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/pdf/1811.11721.pdf) for more detailed information. ### Parameters None ### Inputs
weight: T
Output from the operator MMCVCAWeight.
value: T
The value matrix of shape (N, C, H, W).
### Outputs
output: T
Output tensor of aggregated contextual information
### Type Constraints - T:tensor(float32) ## MMCVCornerPool ### Description Perform CornerPool on `input` features. Read [CornerNet -- Detecting Objects as Paired Keypoints](https://arxiv.org/abs/1808.01244) for more details. ### Parameters | Type | Parameter | Description | | ----- | --------- | ---------------------------------------------------------------- | | `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | ### Inputs
input: T
Input features. 4-D tensor of shape (N, C, H, W). N is the batch size.
### Outputs
output: T
The pooled features. 4-D tensor of shape (N, C, H, W).
### Type Constraints - T:tensor(float32) ## MMCVDeformConv2d ### Description Applies a deformable 2D convolution over an input signal composed of several input planes. Read [Deformable Convolutional Networks](https://arxiv.org/pdf/1703.06211.pdf) for detail. ### Parameters | Type | Parameter | Description | | -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- | | `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | | `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | | `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`. | | `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`. | | `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | | `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | | `int` | `im2col_step` | Number of samples processed by the im2col kernel per call; the batch size must be divisible by it. Defaults to `32`. | ### Inputs
input: T
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
offset: T
Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.
weight: T
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
### Outputs
output: T
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
### Type Constraints - T:tensor(float32, Linear) ## MMCVModulatedDeformConv2d ### Description Perform Modulated Deformable Convolution on input feature, read [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/abs/1811.11168?from=timeline) for detail. ### Parameters | Type | Parameter | Description | | -------------- | ------------------- | ------------------------------------------------------------------------------------- | | `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | | `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | | `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | | `int` | `deformable_groups` | Groups of deformable offset. | | `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. | ### Inputs
feature: T
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
offset: T
Input offset; 4-D tensor of shape (N, deformable_group* 2* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.
mask: T
Input mask; 4-D tensor of shape (N, deformable_group* kH* kW, outH, outW), where kH and kW are the height and width of weight, outH and outW are the height and width of offset and output.
weight: T
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
bias: T, optional
Input bias; 1-D tensor of shape (output_channel).
### Outputs
output: T
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
### Type Constraints - T:tensor(float32, Linear) ## MMCVDeformRoIPool ### Description Deformable roi pooling layer ### Parameters | Type | Parameter | Description | | ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | | `float` | `gamma` | gamma | ### Inputs
input: T
Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.
rois: T
RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of the input.
offset: T
Offset of height and width. Defaults to a tensor of zeros.
### Outputs
feat: T
RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].
### Type Constraints - T:tensor(float32) ## MMCVMaskedConv2d ### Description Performs a masked 2D convolution from PixelRNN Read [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759) for more detailed information. ### Parameters | Type | Parameter | Description | | -------------- | --------- | -------------------------------------------------------------------------------- | | `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | | `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. | ### Inputs
features: T
Input features; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.
mask: T
Input mask; 3D tensor of shape (N, H, W)
weight: T
The learnable weights of the module
bias: T
The learnable bias of the module
### Outputs
output: T
The output convolved feature
### Type Constraints - T:tensor(float32) ## MMCVPSAMask ### Description An operator from PSANet. Read [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://hszhao.github.io/papers/eccv18_psanet.pdf) for more detailed information. ### Parameters | Type | Parameter | Description | | -------------- | ----------- | -------------------------------------------- | | `int` | `psa_type` | `0` means collect and `1` means `distribute` | | `list of ints` | `mask_size` | The size of mask | ### Inputs
input: T
Input feature; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.
### Outputs
output: T
Output tensor of shape (N, H * W, H, W)
### Type Constraints - T:tensor(float32) ## NonMaxSuppression ### Description Filter out boxes that have high IoU overlap with previously selected boxes or a low score. Output the indices of valid boxes. Note that this definition is slightly different from [onnx: NonMaxSuppression](https://github.com/onnx/onnx/blob/main/docs/Operators.md#nonmaxsuppression) ### Parameters | Type | Parameter | Description | | ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | | `int` | `center_point_box` | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1 - the box data is supplied as \[x_center, y_center, width, height\]. | | `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | | `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | | `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. | | `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | ### Inputs
boxes: T
Input boxes. 3-D tensor of shape (num_batches, spatial_dimension, 4).
scores: T
Input scores. 3-D tensor of shape (num_batches, num_classes, spatial_dimension).
### Outputs
indices: tensor(int32, Linear)
Selected indices. 2-D tensor of shape (num_selected_indices, 3) as [[batch_index, class_index, box_index], ...].
num_selected_indices=num_batches* num_classes* min(max_output_boxes_per_class, spatial_dimension).
All invalid indices will be filled with -1.
### Type Constraints - T:tensor(float32, Linear) ## MMCVRoIAlign ### Description Perform RoIAlign on output feature, used in bbox_head of most two-stage detectors. ### Parameters | Type | Parameter | Description | | ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | | `str` | `mode` | pooling mode in each bin. `avg` or `max` | | `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | ### Inputs
input: T
Input feature map; 4D tensor of shape (N, C, H, W), where N is the batch size, C is the number of channels, H and W are the height and width of the data.
rois: T
RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are in the coordinate system of the input.
### Outputs
feat: T
RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].
### Type Constraints - T:tensor(float32) ## MMCVRoIAlignRotated ### Description Perform RoI align pooling for rotated proposals ### Parameters | Type | Parameter | Description | | ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | | `str` | `mode` | pooling mode in each bin. `avg` or `max` | | `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | | `int` | `clockwise` | If `clockwise=1`, the angle in each proposal follows a clockwise fashion in image space. Otherwise, the angle is counterclockwise. | ### Inputs
features: T
Input feature map; 4D tensor of shape (N, C, H, W)
rois: T
RoIs (Regions of Interest) to pool over; 2-D tensor of shape (num_rois, 5) given as [[batch_index, x1, y1, x2, y2], ...]. The RoIs' coordinates are the coordinate system of input.
### Outputs
feat: T
RoI pooled output, 4-D tensor of shape (num_rois, C, output_height, output_width). The r-th batch element feat[r-1] is a pooled feature map corresponding to the r-th RoI RoIs[r-1].
### Type Constraints - T:tensor(float32) ## grid_sampler\* ### Description Perform sample from `input` with pixel locations from `grid`. Check [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html?highlight=grid_sample#torch.nn.functional.grid_sample) for more information. ### Parameters | Type | Parameter | Description | | ----- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `int` | `interpolation_mode` | Interpolation mode to calculate output values. (0: `bilinear` , 1: `nearest`) | | `int` | `padding_mode` | Padding mode for outside grid values. (0: `zeros`, 1: `border`, 2: `reflection`) | | `int` | `align_corners` | If `align_corners=1`, the extrema (`-1` and `1`) are considered as referring to the center points of the input's corner pixels. If `align_corners=0`, they are instead considered as referring to the corner points of the input's corner pixels, making the sampling more resolution agnostic. | ### Inputs
input: T
Input feature; 4-D tensor of shape (N, C, inH, inW), where N is the batch size, C is the number of channels, inH and inW are the height and width of the data.
grid: T
Input offset; 4-D tensor of shape (N, outH, outW, 2), where outH and outW are the height and width of offset and output.
### Outputs
output: T
Output feature; 4-D tensor of shape (N, C, outH, outW).
### Type Constraints - T:tensor(float32, Linear) ## cummax\* ### Description Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum elements of `input` in the dimension `dim`. And `indices` is the index location of each maximum value found in the dimension `dim`. Read [torch.cummax](https://pytorch.org/docs/stable/generated/torch.cummax.html) for more details. ### Parameters | Type | Parameter | Description | | ----- | --------- | -------------------------------------- | | `int` | `dim` | the dimension to do the operation over | ### Inputs
input: T
The input tensor with various shapes. Tensor with empty element is also supported.
### Outputs
output: T
Output the cumulative maximum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.
indices: tensor(int64)
Output the index location of each cumulative maximum value found in the dimension `dim`, with the same shape as `input`.
### Type Constraints - T:tensor(float32) ## cummin\* ### Description Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum elements of `input` in the dimension `dim`. And `indices` is the index location of each minimum value found in the dimension `dim`. Read [torch.cummin](https://pytorch.org/docs/stable/generated/torch.cummin.html) for more details. ### Parameters | Type | Parameter | Description | | ----- | --------- | -------------------------------------- | | `int` | `dim` | the dimension to do the operation over | ### Inputs
input: T
The input tensor with various shapes. Tensor with empty element is also supported.
### Outputs
output: T
Output the cumulative minimum elements of `input` in the dimension `dim`, with the same shape and dtype as `input`.
indices: tensor(int64)
Output the index location of each cumulative minimum value found in the dimension `dim`, with the same shape as `input`.
### Type Constraints - T:tensor(float32) ## Reminders - Operators endwith `*` are defined in Torch and are included here for the conversion to ONNX. ================================================ FILE: docs/en/docutils.conf ================================================ [html writers] table_style: colwidths-auto ================================================ FILE: docs/en/faq.md ================================================ ## Frequently Asked Questions We list some common troubles faced by many users and their corresponding solutions here. Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. ### Installation - KeyError: "xxx: 'yyy is not in the zzz registry'" The registry mechanism will be triggered only when the file of the module is imported. So you need to import that file somewhere. More details can be found at [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974). - "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" 1. Uninstall existing mmcv in the environment using `pip uninstall mmcv` 2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) or [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) - "invalid device function" or "no kernel image is available for execution" 1. Check the CUDA compute capability of you GPU 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. The compatibility issue could happen when using old GPUS, e.g., Tesla K80 (3.7) on colab. 3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. 
For example, you may compile mmcv using CUDA 10.0 but run it on CUDA 9.0 environments - "undefined symbol" or "cannot open xxx.so" 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check whether the CUDA/GCC runtimes are the same as those used for compiling mmcv 2. If those symbols are Pytorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the Pytorch version is the same as that used for compiling mmcv 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment - "RuntimeError: CUDA error: invalid configuration argument" This error may be caused by the poor performance of GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) and recompile mmcv. - "RuntimeError: nms is not compiled with GPU support" This error is because your CUDA environment is not installed correctly. You may try to re-install your CUDA environment and then delete the build/ folder before re-compile mmcv. - "Segmentation fault" 1. Check your GCC version and use GCC >= 5.4. This is usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). We also recommend that users avoid GCC 5.5 because many users have reported that GCC 5.5 causes "segmentation fault" and that simply changing it to GCC 5.4 could solve the problem 2. Check whether PyTorch is correctly installed and could use CUDA op, e.g. type the following command in your terminal and see whether they could correctly output results ```shell python -c 'import torch; print(torch.cuda.is_available())' ``` 3. If PyTorch is correctly installed, check whether MMCV is correctly installed. If MMCV is correctly installed, the following command will run without any error ```shell python -c 'import mmcv; import mmcv.ops' ``` 4. 
If MMCV and PyTorch are correctly installed, you can use `ipdb` to set breakpoints or directly add `print` to debug and see which part leads to the `segmentation fault` - "libtorch_cuda_cu.so: cannot open shared object file" `mmcv-full` depends on this shared object, but it cannot be found. We can check whether the object exists in `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` or try to re-install PyTorch. - "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" If you are building mmcv-full on Windows and the version of CUDA is 9.2, you will probably encounter the error `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`, in which case you can use a lower version of Microsoft Visual Studio like vs2017. - "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" If your version of PyTorch is 1.5.0 and you are building mmcv-full on Windows, you will probably encounter the error `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`. The way to solve the error is to replace all the `static constexpr bool all_slots = false;` with `static bool all_slots = false;` at this file `https://github.com/pytorch/pytorch/blob/v1.5.0/torch/csrc/jit/api/module.h`. More details can be found at [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394). - "error: a member with an in-class initializer must be const" If your version of PyTorch is 1.6.0 and you are building mmcv-full on Windows, you will probably encounter the error `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. 
The way to solve the error is to replace all the `CONSTEXPR_EXCEPT_WIN_CUDA ` with `const` at `torch/include\torch/csrc/jit/api/module.h`. More details can be found at [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575). - "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" If your version of PyTorch is 1.7.0 and you are building mmcv-full on Windows, you will probably encounter the error `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. Solving the error requires modifying several local files of PyTorch: - delete `static constexpr Symbol Kind = ::c10::prim::profile;` and `static constexpr Symbol Kind = ::c10::prim::profile_optional;` at `torch/include\torch/csrc/jit/ir/ir.h` - replace `explicit operator type&() { return *(this->value); }` with `explicit operator type&() { return *((type*)this->value); }` at `torch\include\pybind11\cast.h` - replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\torch/csrc/jit/api/module.h` More details can be found at [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956). - Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer" Please install the correct version of MMCV for the version of your MMDetection following the [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation). ### Usage - "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" 1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. More details at [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582). 2. 
You can set `find_unused_parameters = True` in the config to solve the above problems or find those unused parameters manually - "RuntimeError: Trying to backward through the graph a second time" `GradientCumulativeOptimizerHook` and `OptimizerHook` are both set, which causes `loss.backward()` to be called twice, so a `RuntimeError` is raised. Only one of them can be used at a time. More details at [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379). ================================================ FILE: docs/en/get_started/api_reference.md ================================================ # API reference table The `mmcv.fileio`, `mmcv.runner`, `mmcv.parallel`, `mmcv.engine`, and `mmcv.device` modules, as well as all classes and most of the functions in the `mmcv.utils` module, were removed during the upgrade from MMCV v1.x to MMCV v2.x in PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179), PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216), and PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217). Therefore, we provide the following API reference table to make it easier to quickly find the migrated interfaces. ## Related issues, PRs and discussions - [Remove runner, parallel, engine and device](https://github.com/open-mmlab/mmcv/pull/2216) - [ImportError: cannot import name 'is_list_of' from 'mmcv.utils'](https://github.com/open-mmlab/mmcv/issues/2282) - [Could not find the files in MMengine which are removed in MMCV_v2x parallel. 
example, for DataContainer](https://github.com/open-mmlab/mmcv/issues/2934) - [mmcv.cnn.bricks.registry](https://github.com/open-mmlab/mmengine/discussions/1356) - [Replace mmcv's function and modules imported with mmengine's](https://github.com/open-mmlab/mmdetection/pull/8594) ## `mmcv.fileio` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------------------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | | mmcv.fileio.file_client.BaseStorageBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.base.BaseStorageBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/base.py | | mmcv.fileio.file_client.CephBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | | | | mmcv.fileio.file_client.PetrelBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.petrel_backend.PetrelBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/petrel_backend.py | | mmcv.fileio.file_client.MemcachedBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.memcached_backend.MemcachedBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/memcached_backend.py | | mmcv.fileio.file_client.LmdbBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.lmdb_backend.LmdbBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/lmdb_backend.py | | mmcv.fileio.file_client.HardDiskBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.file_client.HardDiskBackend | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py | | mmcv.fileio.file_client.HTTPBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.http_backend.HTTPBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/http_backend.py | | mmcv.fileio.file_client.FileClient | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.file_client.FileClient | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py | | mmcv.fileio.io.load | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.io.load | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py | | mmcv.fileio.io.dump | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.io.dump | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py | | mmcv.fileio.io.\_register_handler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.handlers.\_register_handler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py | | mmcv.fileio.io.register_handler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.handlers.register_handler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py | | mmcv.fileio.parse.list_from_file | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py | mmengine.fileio.parse.list_from_file | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py | | mmcv.fileio.parse.dict_from_file | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py | mmengine.fileio.parse.dict_from_file | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py | | mmcv.fileio.handlers.base.BaseFileHandler | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/base.py | mmengine.fileio.handlers.base.BaseFileHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/base.py | | mmcv.fileio.handlers.json_handler.set_default | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py | mmengine.fileio.handlers.json_handler.set_default | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py | | mmcv.fileio.handlers.json_handler.JsonHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py | mmengine.fileio.handlers.json_handler.JsonHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py | | mmcv.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/pickle_handler.py | mmengine.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/pickle_handler.py | | mmcv.fileio.handlers.yaml_handler.YamlHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/yaml_handler.py | mmengine.fileio.handlers.yaml_handler.YamlHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/yaml_handler.py | ## `mmcv.runner` | MMCV | MMCV URL | MMEngine | MMEngine URL | | --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | mmcv.runner.hooks.logger.base.LoggerHook | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/base.py | mmengine.hooks.logger_hook.LoggerHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py | | mmcv.runner.hooks.logger.clearml.ClearMLLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/clearml.py | Similar: mmengine.visualization.vis_backend.ClearMLVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.dvclive.DvcliveLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/dvclive.py | Similar: mmengine.visualization.vis_backend.DVCLiveVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.mlflow.MlflowLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/mlflow.py | Similar: mmengine.visualization.vis_backend.MLflowVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.neptune.NeptuneLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/neptune.py | Similar: mmengine.visualization.vis_backend.NeptuneVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.pavi.PaviLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/pavi.py | | | | mmcv.runner.hooks.logger.segmind.SegmindLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/segmind.py | | | | mmcv.runner.hooks.logger.tensorboard.TensorboardLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/tensorboard.py | Similar: mmengine.visualization.vis_backend.TensorboardVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | 
mmcv.runner.hooks.logger.text.TextLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/text.py | | | | mmcv.runner.hooks.logger.wandb.WandbLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/wandb.py | Similar: mmengine.visualization.vis_backend.WandbVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.checkpoint.CheckpointHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/checkpoint.py | mmengine.hooks.checkpoint_hook.CheckpointHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py | | mmcv.runner.hooks.closure.ClosureHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/closure.py | | | | mmcv.runner.hooks.ema.EMAHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/ema.py | mmengine.hooks.ema_hook.EMAHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py | | mmcv.runner.hooks.evaluation.EvalHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py | Some features have been moved to: mmengine.hooks.checkpoint_hook.CheckpointHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py | | mmcv.runner.hooks.evaluation.DistEvalHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py | Some features have been moved to: mmengine.hooks.checkpoint_hook.CheckpointHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py | | mmcv.runner.hooks.hook.HOOKS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py | mmengine.registry.root.HOOKS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.hooks.hook.Hook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py | mmengine.hooks.hook.Hook | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py | | mmcv.runner.hooks.iter_timer.IterTimerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/iter_timer.py | mmengine.hooks.iter_timer_hook.IterTimerHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py | | mmcv.runner.hooks.lr_updater.LrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.LRSchedulerMixin | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.FixedLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.ConstantLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.StepLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.StepLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.ExpLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.ExponentialLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.PolyLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.PolyLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.InvLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.CosineAnnealingUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | 
mmengine.optim.scheduler.lr_scheduler.CosineAnnealingLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.FlatCosineAnnealingUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.CosineRestartLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.get_position_from_periods | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR.get_position_from_periods | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.CyclicLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.OneCycleLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.LinearAnnealingLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.annealing_cos | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_cos | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.annealing_linear | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_linear | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.format_param | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_format_param | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.memory.EmptyCacheHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/memory.py | mmengine.hooks.empty_cache_hook.EmptyCacheHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py | | mmcv.runner.hooks.momentum_updater.MomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | mmengine.optim.scheduler.momentum_scheduler.MomentumSchedulerMixin | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py | | mmcv.runner.hooks.momentum_updater.StepMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | mmengine.optim.scheduler.momentum_scheduler.StepMomentum | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py | | mmcv.runner.hooks.momentum_updater.CosineAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | mmengine.optim.scheduler.momentum_scheduler.CosineAnnealingMomentum | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py | | mmcv.runner.hooks.momentum_updater.LinearAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | | | | mmcv.runner.hooks.momentum_updater.CyclicMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | | | | mmcv.runner.hooks.momentum_updater.OneCycleMomentumUpdaterHook | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | | | | mmcv.runner.hooks.optimizer.OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | mmengine.optimizer.optimizer_wrapper.OptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py | | mmcv.runner.hooks.optimizer.GradientCumulativeOptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | | | | mmcv.runner.hooks.optimizer.Fp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | Moved to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | | | | mmcv.runner.hooks.optimizer.Fp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | Moved to: mmengine.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | | | | mmcv.runner.hooks.profiler.ProfilerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/profiler.py | mmengine.hooks.profiler_hook.ProfilerHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/profiler_hook.py | | 
mmcv.runner.hooks.sampler_seed.DistSamplerSeedHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sampler_seed.py | mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py | | mmcv.runner.hooks.sync_buffer.SyncbuffersHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sync_buffer.py | mmengine.hooks.sync_buffer_hook.SyncBufferHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py | | mmcv.runner.optimizer.builder.OPTIMIZERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | mmengine.registry.root.OPTIMIZERS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.optimizer.builder.OPTIMIZER_BUILDERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.builder.register_torch_optimizers | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | mmengine.optim.optimizer.builder.register_torch_optimizers | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/builder.py | | mmcv.runner.optimizer.builder.TORCH_OPTIMIZERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.builder.build_optimizer_constructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.builder.build_optimizer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.default_constructor.DefaultOptimizerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/default_constructor.py | | | | mmcv.runner.base_module.BaseModule | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.BaseModule | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_module.Sequential | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.Sequential | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_module.ModuleList | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.ModuleList | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_module.ModuleDict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.ModuleDict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_runner.BaseRunner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_runner.py | mmengine.runner.runner.Runner | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py | | mmcv.runner.builder.RUNNERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | mmengine.registry.root.RUNNERS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.builder.RUNNER_BUILDERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | mmengine.registry.root.RUNNER_CONSTRUCTORS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.builder.build_runner_constructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | | | | mmcv.runner.builder.build_runner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | | | | mmcv.runner.checkpoint.ENV_MMCV_HOME | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.ENV_MMENGINE_HOME | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.ENV_XDG_CACHE_HOME | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.ENV_XDG_CACHE_HOME | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.DEFAULT_CACHE_HOME | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.DEFAULT_CACHE_DIR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_get_mmcv_home | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_get_mmengine_home | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_state_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_state_dict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_torchvision_models | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_torchvision_models | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_external_models | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_external_models | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_mmcls_models | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_mmcls_models | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_deprecated_model_names | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_deprecated_model_names | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | 
mmcv.runner.checkpoint.\_process_mmcls_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_process_mmcls_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.CheckpointLoader | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.CheckpointLoader | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_local | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_local | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_http | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_http | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_pavi | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_pavi | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_ceph | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_ceph | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_torchvision | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_torchvision | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_openmmlab | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_openmmlab | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | 
mmcv.runner.checkpoint.load_from_mmcls | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_mmcls | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_load_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_load_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_load_checkpoint_with_prefix | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_load_checkpoint_with_prefix | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.weights_to_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.weights_to_cpu | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_save_to_state_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_save_to_state_dict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_state_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_state_dict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.save_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.save_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | 
mmcv.runner.default_constructor.DefaultRunnerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/default_constructor.py | | | | mmcv.runner.dist_utils.\_find_free_port | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.dist_utils.\_is_free_port | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.dist_utils.init_dist | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.init_dist | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.\_init_dist_pytorch | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.\_init_dist_pytorch | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.\_init_dist_mpi | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.\_init_dist_mpi | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.\_init_dist_slurm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.\_init_dist_slurm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.get_dist_info | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.get_dist_info | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.master_only | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.master_only | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.allreduce_params | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.dist_utils.allreduce_grads | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py 
| | | | mmcv.runner.dist_utils.\_allreduce_coalesced | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.epoch_based_runner.EpochBasedRunner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py | mmengine.runner.loops.EpochBasedTrainLoop | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py | | mmcv.runner.epoch_based_runner.Runner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py | | | | mmcv.runner.fp16_utils.cast_tensor_type | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.auto_fp16 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.force_fp32 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.allreduce_grads | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.wrap_fp16_model | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.patch_norm_fp32 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.patch_forward_method | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.LossScaler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | Moved fp16-related to: 
mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper and mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.iter_based_runner.IterLoader | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py | | | | mmcv.runner.iter_based_runner.IterBasedRunner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py | mmengine.runner.loops.IterBasedTrainLoop | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py | | mmcv.runner.log_buffer.LogBuffer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/log_buffer.py | | | | mmcv.runner.priority.Priority | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py | mmengine.runner.priority.Priority | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py | | mmcv.runner.priority.get_priority | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py | mmengine.runner.priority.get_priority | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py | | mmcv.runner.utils.get_host_info | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | | | | mmcv.runner.utils.get_time_str | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | | | | mmcv.runner.utils.obj_from_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | | | | mmcv.runner.utils.set_random_seed | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | mmengine.runner.utils.set_random_seed | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/utils.py | ## `mmcv.parallel` | MMCV | MMCV URL | MMEngine | MMEngine URL | | -------------------------------------------------------------- | 
------------------------------------------------------------------------------ | -------------------------------------------------------------- | ----------------------------------------------------------------------------------------- | | mmcv.parallel.\_functions.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.\_functions.synchronize_stream | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.\_functions.get_input_device | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.\_functions.Scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.collate.collate | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/collate.py | | | | mmcv.parallel.data_container.assert_tensor_type | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | | | | mmcv.parallel.data_container.DataContainer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | Similar: mmengine.structures.base_data_element.BaseDataElement | https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/base_data_element.py | | mmcv.parallel.data_parallel.MMDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_parallel.py | | | | mmcv.parallel.distributed.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py | mmengine.model.wrappers.distributed.MMDistributedDataParallel | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py | | mmcv.parallel.distributed_deprecated.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed_deprecated.py | mmengine.model.wrappers.distributed.MMDistributedDataParallel | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py | | 
mmcv.parallel.registry.MODULE_WRAPPERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/registry.py | mmengine.registry.root.MODEL_WRAPPERS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.parallel.scatter_gather.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py | | | | mmcv.parallel.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py | | | | mmcv.parallel.utils.is_module_wrapper | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/utils.py | mmengine.model.wrappers.utils.is_model_wrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/utils.py | ## `mmcv.engine` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------ | ------------------------------------------------------------------ | -------- | ------------ | | mmcv.engine.test.single_gpu_test | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | | mmcv.engine.test.multi_gpu_test | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | | mmcv.engine.test.collect_results_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | | mmcv.engine.test.collect_results_gpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | ## `mmcv.device` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ----------------------------------------- | ---------------------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------- | | mmcv.device.ipu | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/ipu | | | | mmcv.device.mlu | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mlu | | | | mmcv.device.mps | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mps | | | | mmcv.device.npu | 
https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/npu | | | | mmcv.device.\_functions.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py | | | | mmcv.device.\_functions.Scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py | | | | mmcv.device.scatter_gather.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py | | | | mmcv.device.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py | | | | mmcv.device.utils.get_device | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/utils.py | mmengine.device.utils.get_device | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | ## `mmcv.utils` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | | mmcv.utils.config.BASE_KEY | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.BASE_KEY | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.DELETE_KEY | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.DELETE_KEY | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.DEPRECATION_KEY | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.DEPRECATION_KEY | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.ConfigDict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.ConfigDict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | 
mmcv.utils.config.add_args | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.add_args | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.Config | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.Config | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.DictAction | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.DictAction | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.device_type.is_ipu_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | | | | mmcv.utils.device_type.IS_IPU_AVAILABLE | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | | | | mmcv.utils.device_type.is_mlu_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | mmengine.device.utils.is_mlu_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.device_type.is_mps_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | mmengine.device.utils.is_mps_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.device_type.is_npu_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | mmengine.device.utils.is_npu_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.hub.\_is_legacy_zip_format | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py | mmengine.utils.dl_utils.hub.\_is_legacy_zip_format | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py | | mmcv.utils.hub.\_legacy_zip_load | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py | mmengine.utils.dl_utils.hub.\_legacy_zip_load | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py | | mmcv.utils.hub.load_url | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py | mmengine.utils.dl_utils.hub.load_url | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py | | mmcv.utils.logging.logger_initialized | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py | | | | mmcv.utils.logging.get_logger | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py | | | | mmcv.utils.logging.print_log | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py | | | | mmcv.utils.misc.\_ntuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.\_ntuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_1tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_1tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_2tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_2tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_3tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_3tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_4tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_4tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_ntuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_ntuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_str | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_str | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.import_modules_from_strings | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.import_modules_from_strings | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.iter_cast | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.iter_cast | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.list_cast | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.list_cast | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.tuple_cast | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.tuple_cast | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_seq_of | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_seq_of | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_list_of | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_list_of | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_tuple_of | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_tuple_of | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.slice_list | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.slice_list | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.concat_list | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.concat_list | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.check_prerequisites | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.check_prerequisites | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.\_check_py_package | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.\_check_py_package | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.\_check_executable | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.\_check_executable | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.requires_package | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.requires_package | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.requires_executable | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.requires_executable | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.deprecated_api_warning | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.deprecated_api_warning | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_method_overridden | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_method_overridden | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.has_method | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.has_method | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.parrots_wrapper.TORCH_VERSION | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.TORCH_VERSION | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.is_cuda_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.device.utils.is_cuda_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.parrots_wrapper.IS_CUDA_AVAILABLE | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | | | | mmcv.utils.parrots_wrapper.is_rocm_pytorch | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.is_rocm_pytorch | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_cuda_home | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_cuda_home | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.get_build_config | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.get_build_config | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_conv | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_conv | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_dataloader | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_dataloader | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_extension | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | 
mmengine.utils.dl_utils.parrots_wrapper.\_get_extension | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_pool | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_pool | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_norm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_norm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.SyncBatchNorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.SyncBatchNorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.path.is_filepath | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.is_filepath | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.fopen | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.fopen | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.check_file_exist | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.check_file_exist | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.mkdir_or_exist | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.mkdir_or_exist | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.symlink | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.symlink | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | 
mmcv.utils.path.scandir | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.scandir | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.find_vcs_root | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.find_vcs_root | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.progressbar.ProgressBar | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.ProgressBar | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.track_progress | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.track_progress | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.init_pool | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.init_pool | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.track_parallel_progress | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.track_parallel_progress | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.track_iter_progress | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.track_iter_progress | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.registry.build_from_cfg | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py | mmengine.registry.build_functions.build_from_cfg | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/build_functions.py | | mmcv.utils.registry.Registry | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py | 
mmengine.registry.registry.Registry | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/registry.py | | mmcv.utils.seed.worker_init_fn | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/seed.py | mmengine.dataset.utils.worker_init_fn | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/utils.py | | mmcv.utils.testing.check_python_script | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.check_python_script | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.\_any | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.\_any | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_dict_contains_subset | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_dict_contains_subset | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_attrs_equal | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_attrs_equal | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_dict_has_keys | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_dict_has_keys | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_keys_equal | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_keys_equal | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_is_norm_layer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_is_norm_layer | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_params_all_zeros | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_params_all_zeros | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.timer.TimerError | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.TimerError | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.timer.Timer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.Timer | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.timer.\_g_timers | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.\_g_timers | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.timer.check_time | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.check_time | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py | mmengine.utils.dl_utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py | | mmcv.utils.torch_ops.torch_meshgrid | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py | mmengine.utils.dl_utils.torch_ops.torch_meshgrid | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py | | mmcv.utils.trace.is_jit_tracing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/trace.py | mmengine.utils.dl_utils.trace.is_jit_tracing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/trace.py | | mmcv.utils.version_utils.digit_version | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py | mmengine.utils.version_utils.digit_version | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py | | mmcv.utils.version_utils.\_minimal_ext_cmd | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py | mmengine.utils.version_utils.\_minimal_ext_cmd | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py | | mmcv.utils.version_utils.get_git_hash | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py | mmengine.utils.version_utils.get_git_hash | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py | ## `mmcv.cnn` | MMCV | MMCV URL | MMEngine | MMEngine URL | | -------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------------------ | | mmcv.cnn.utils.sync_bn.\_BatchNormXd | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.\_BatchNormXd | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py | | mmcv.cnn.utils.sync_bn.revert_sync_batchnorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.revert_sync_batchnorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py | ## `mmcv.model_zoo` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------ | ----------------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------- | | mmcv.model_zoo.deprecated.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/deprecated.json | mmengine.hub.deprecated.json | 
https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/deprecated.json | | mmcv.model_zoo.mmcls.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/mmcls.json | mmengine.hub.mmcls.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/mmcls.json | | mmcv.model_zoo.open_mmlab.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/open_mmlab.json | mmengine.hub.openmmlab.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/openmmlab.json | | mmcv.model_zoo.torchvision_0.12.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/torchvision_0.12.json | mmengine.hub.torchvision_0.12.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/torchvision_0.12.json | ================================================ FILE: docs/en/get_started/build.md ================================================ ## Build MMCV from source ### Build mmcv Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command ```bash python -c 'import torch;print(torch.__version__)' ``` If version information is output, then PyTorch is installed. ```{note} If you would like to use `opencv-python-headless` instead of `opencv-python`, e.g., in a minimum container environment or servers without GUI, you can first install it before installing MMCV to skip the installation of `opencv-python`. ``` #### Build on Linux 1. Clone the repo ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` 2. Install `ninja` and `psutil` to speed up the compilation ```bash pip install -r requirements/optional.txt ``` 3. Check the nvcc version (requires 9.2+. Skip if no GPU available.) ```bash nvcc --version ``` If the above command outputs the following message, it means that the nvcc setting is OK, otherwise you need to set CUDA_HOME. 
``` nvcc: NVIDIA (R) Cuda compiler driver Copyright (c) 2005-2020 NVIDIA Corporation Built on Mon_Nov_30_19:08:53_PST_2020 Cuda compilation tools, release 11.2, V11.2.67 Build cuda_11.2.r11.2/compiler.29373293_0 ``` :::{note} If you want to support ROCm, you can refer to [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) to install ROCm. ::: 4. Check the gcc version (requires 5.4+) ```bash gcc --version ``` 5. Start building (takes 10+ min) ```bash pip install -e . -v ``` 6. Validate the installation ```bash python .dev_scripts/check_installation.py ``` If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution. If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). #### Build on macOS ```{note} If you are using a mac with apple silicon chip, install the PyTorch 1.13+, otherwise you will encounter the problem in [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218). ``` 1. Clone the repo ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` 2. Install `ninja` and `psutil` to speed up the compilation ```bash pip install -r requirements/optional.txt ``` 3. Start building ```bash MMCV_WITH_OPS=1 pip install -e . ``` 4. Validate the installation ```bash python .dev_scripts/check_installation.py ``` If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution. If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). #### Build on Windows Building MMCV on Windows is a bit more complicated than that on Linux. The following instructions show how to get this accomplished. 
##### Prerequisite The following software is required for building MMCV on windows. Install them first. - [Git](https://git-scm.com/download/win) - During installation, tick **add git to Path**. - [Visual Studio Community 2019](https://visualstudio.microsoft.com) - A compiler for C++ and CUDA codes. - [Miniconda](https://docs.conda.io/en/latest/miniconda.html) - Official distributions of Python should work too. - [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive) - Not required for building CPU version. - Customize the installation if necessary. As a recommendation, skip the driver installation if a newer version is already installed. ```{note} You should know how to set up environment variables, especially `Path`, on Windows. The following instruction relies heavily on this skill. ``` ##### Common steps 1. Launch Anaconda prompt from Windows Start menu Do not use raw `cmd.exe` s instruction is based on PowerShell syntax. 2. Create a new conda environment ```powershell (base) PS C:\Users\xxx> conda create --name mmcv python=3.7 (base) PS C:\Users\xxx> conda activate mmcv # make sure to activate environment before any operation ``` 3. Install PyTorch. Choose a version based on your need. ```powershell # CUDA version (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch # CPU version (mmcv) PS C:\Users\xxx> conda install install pytorch torchvision cpuonly -c pytorch ``` 4. Clone the repo ```powershell (mmcv) PS C:\Users\xxx> git clone https://github.com/open-mmlab/mmcv.git (mmcv) PS C:\Users\xxx\mmcv> cd mmcv ``` 5. Install `ninja` and `psutil` to speed up the compilation ```powershell (mmcv) PS C:\Users\xxx\mmcv> pip install -r requirements/optional.txt ``` 6. Set up MSVC compiler Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. 
```powershell (mmcv) PS C:\Users\xxx\mmcv> cl Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 Copyright (C) Microsoft Corporation. All rights reserved. usage: cl [ option... ] filename... [ / link linkoption... ] ``` For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. ##### Build and install MMCV mmcv can be built in two ways: 1. Full version (CPU ops) Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only. 2. Full version (CUDA ops) Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented). ###### CPU version Build and install ```powershell (mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext (mmcv) PS C:\Users\xxx\mmcv> python setup.py develop ``` ###### GPU version 1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: ```powershell (mmcv) PS C:\Users\xxx\mmcv> ls env: Name Value ---- ----- CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 ``` This should already be done by CUDA installer. If not, or you have multiple version of CUDA toolkit installed, set it with ```powershell (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" # OR (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs: ``` 2. 
Set CUDA target arch ```shell # Here you need to change to the target architecture corresponding to your GPU (mmcv) PS C:\Users\xxx\mmcv> $env:TORCH_CUDA_ARCH_LIST="7.5" ``` :::{note} Check your the compute capability of your GPU from [here](https://developer.nvidia.com/cuda-gpus). ```powershell (mmcv) PS C:\Users\xxx\mmcv> &"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\demo_suite\deviceQuery.exe" Device 0: "NVIDIA GeForce GTX 1660 SUPER" CUDA Driver Version / Runtime Version 11.7 / 11.1 CUDA Capability Major/Minor version number: 7.5 ``` The 7.5 above indicates the target architecture. Note: You need to replace v10.2 with your CUDA version in the above command. ::: 3. Build and install ```powershell # build python setup.py build_ext # if success, cl will be launched to compile ops # install python setup.py develop ``` ```{note} If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTorch as described in [this issue](https://github.com/pytorch/pytorch/issues/42467). Follow [this pull request](https://github.com/pytorch/pytorch/pull/43380/files) to modify the source code in your local PyTorch installation. ``` ##### Validate installation ```powershell (mmcv) PS C:\Users\xxx\mmcv> python .dev_scripts/check_installation.py ``` If no error is reported by the above command, the installation is successful. If there is an error reported, please check [Frequently Asked Questions](../faq.md) to see if there is already a solution. If no solution is found, please feel free to open an [issue](https://github.com/open-mmlab/mmcv/issues). ### Build mmcv-lite If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation). 1. Clone the repo ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` 2. Start building ```bash MMCV_WITH_OPS=0 pip install -e . -v ``` 3. 
Validate installation ```bash python -c 'import mmcv;print(mmcv.__version__)' ``` ### Build mmcv-full on Cambricon MLU Devices #### Install torch_mlu ##### Option1: Install mmcv-full based on Cambricon docker image Firstly, install and pull Cambricon docker image (please email service@cambricon.com for the latest release docker): ```bash docker pull ${docker image} ``` Run and attach to the docker, [Install mmcv-full on MLU device](#install-mmcv\-full-on-cambricon-mlu-device) and [make sure you've installed mmcv-full on MLU device successfully](#test-code) ##### Option2: Install mmcv-full from compiling Cambricon PyTorch source code Please email service@cambricon.com or contact with Cambricon engineers for a suitable version of CATCH package. After you get the suitable version of CATCH package, please follow the steps in ${CATCH-path}/CONTRIBUTING.md to install Cambricon PyTorch. #### Install mmcv-full on Cambricon MLU device Clone the repo ```bash git clone https://github.com/open-mmlab/mmcv.git ``` The mlu-ops library will be downloaded to the default directory (mmcv/mlu-ops) while building MMCV. 
You can also set `MMCV_MLU_OPS_PATH` to an existing mlu-ops library before building as follows: ```bash export MMCV_MLU_OPS_PATH=/xxx/xxx/mlu-ops ``` Install mmcv-full ```bash cd mmcv export MMCV_WITH_OPS=1 export FORCE_MLU=1 python setup.py install ``` #### Test Code After finishing previous steps, you can run the following python code to make sure that you've installed mmcv-full on MLU device successfully ```python import torch import torch_mlu from mmcv.ops import sigmoid_focal_loss x = torch.randn(3, 10).mlu() x.requires_grad = True y = torch.tensor([1, 5, 3]).mlu() w = torch.ones(10).float().mlu() output = sigmoid_focal_loss(x, y, 2.0, 0.25, w, 'none') print(output) ``` ================================================ FILE: docs/en/get_started/installation.md ================================================ ## Installation There are two versions of MMCV: - **mmcv**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. - **mmcv-lite**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops. ```{warning} Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is avaliable`. ``` ### Install mmcv Before installing mmcv, make sure that PyTorch has been successfully installed following the [PyTorch official installation guide](https://pytorch.org/get-started/locally/#start-locally). This can be verified using the following command ```bash python -c 'import torch;print(torch.__version__)' ``` If version information is output, then PyTorch is installed. 
#### Install with mim (recommended)

[mim](https://github.com/open-mmlab/mim) is the package management tool for the OpenMMLab projects, which makes it easy to install mmcv

```bash
pip install -U openmim
mim install mmcv
```

If you find that the above installation command does not use a pre-built package ending with `.whl` but a source package ending with `.tar.gz`, you may not have a pre-built package corresponding to the PyTorch or CUDA or mmcv version, in which case you can [build mmcv from source](build.md).

<details>
<summary>Installation log using pre-built packages</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv<br />
<code>Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl</code>

</details>

<details>
<summary>Installation log using source packages</summary>

Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html<br />
Collecting mmcv==2.0.0<br />
<code>Downloading mmcv-2.0.0.tar.gz</code>

</details>

To install a specific version of mmcv, for example, mmcv version 2.0.0, you can use the following command ```bash mim install mmcv==2.0.0 ``` :::{note} If you would like to use `opencv-python-headless` instead of `opencv-python`, e.g., in a minimum container environment or servers without GUI, you can first install it before installing MMCV to skip the installation of `opencv-python`. Alternatively, if it takes too long to install a dependency library, you can specify the pypi source ```bash mim install mmcv -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ::: You can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. #### Install with pip Use the following command to check the version of CUDA and PyTorch ```bash python -c 'import torch;print(torch.__version__);print(torch.version.cuda)' ``` Select the appropriate installation command depending on the type of system, CUDA version, PyTorch version, and MMCV version





If you do not find a corresponding version in the dropdown box above, there is probably no pre-built package for your combination of PyTorch, CUDA and mmcv versions, in which case you can [build mmcv from source](build.md).

:::{note}
mmcv is only compiled on PyTorch 1.x.0 because the compatibility
usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you
can install mmcv compiled with PyTorch 1.x.0 and it usually works well.
For example, if your PyTorch version is 1.8.1, you can feel free to choose 1.8.x.
:::

:::{note}
If you would like to use `opencv-python-headless` instead of `opencv-python`,
e.g., in a minimum container environment or servers without GUI,
you can first install it before installing MMCV to skip the installation of `opencv-python`.

Alternatively, if it takes too long to install a dependency library, you can specify the pypi source

```bash
mim install mmcv -i https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

You can run [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) to check the installation of mmcv after running the installation commands.

#### Using mmcv with Docker

Build with local repository

```bash
git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
docker build -t mmcv -f docker/release/Dockerfile .
```

Or build with remote repository

```bash
docker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release
```

The [Dockerfile](release/Dockerfile) installs the latest released version of mmcv by default, but you can specify a different mmcv version to install with the `MMCV` build argument.

```bash
docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0 .
```

If you want to use other versions of PyTorch and CUDA, you can also pass them when building docker images.

An example to build an image with PyTorch 1.11 and CUDA 11.3.

```bash
docker build -t mmcv -f docker/release/Dockerfile \
    --build-arg PYTORCH=1.11.0 \
    --build-arg CUDA=11.3 \
    --build-arg CUDNN=8 \
    --build-arg MMCV=2.0.0 .
```

More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).

### Install mmcv-lite

If you need to use PyTorch-related modules, make sure PyTorch has been successfully installed in your environment by referring to the [PyTorch official installation guide](https://github.com/pytorch/pytorch#installation).

```bash
pip install mmcv-lite
```


================================================
FILE: docs/en/get_started/introduction.md
================================================
## Introduction

MMCV is a foundational library for computer vision research and provides the following functionalities.

- [Image/Video processing](../understand_mmcv/data_process.md)
- [Image and annotation visualization](../understand_mmcv/visualization.md)
- [Image transformation](../understand_mmcv/data_transform.md)
- [Various CNN architectures](../understand_mmcv/cnn.md)
- [High-quality implementation of common CUDA ops](../understand_mmcv/ops.md)

It supports the following systems:

- Linux
- Windows
- macOS

It supports many research projects as below:

- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark.
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.
- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.


================================================
FILE: docs/en/get_started/previous_versions.md
================================================
## OTHER VERSIONS OF PYTORCH BUILT FOR MMCV-FULL

We no longer provide `mmcv-full` packages compiled under lower versions of `PyTorch`, but for your convenience, you can find them below.

### PyTorch 1.4

| 1.0.0 \<= mmcv_version \<= 1.2.1

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.4.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.4.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.4.0/index.html
```

### PyTorch 1.3

| 1.0.0 \<= mmcv_version \<= 1.3.16

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.3.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.3.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.3.0/index.html
```


================================================
FILE: docs/en/index.rst
================================================
Welcome to MMCV's documentation!
================================

You can switch between Chinese and English documents in the lower-left corner of the layout.

.. toctree::
   :maxdepth: 2
   :caption: Get Started

   get_started/introduction.md
   get_started/installation.md
   get_started/build.md
   get_started/api_reference.md

.. toctree::
   :maxdepth: 2
   :caption: Understand MMCV

   understand_mmcv/data_process.md
   understand_mmcv/data_transform.md
   understand_mmcv/visualization.md
   understand_mmcv/cnn.md
   understand_mmcv/ops.md

.. toctree::
   :maxdepth: 2
   :caption: Deployment

   deployment/mmcv_ops_definition.md

.. toctree::
   :caption: Switch Language

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: Compatibility

   compatibility.md

.. toctree::

   faq.md

.. toctree::
   :maxdepth: 2
   :caption: Community

   community/contributing.md
   community/pr.md

.. toctree::
   :maxdepth: 1
   :caption: API Reference

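   mmcv.image <api/image>
   mmcv.video <api/video>
   mmcv.visualization <api/visualization>
   mmcv.cnn <api/cnn>
   mmcv.ops <api/ops>
   mmcv.transforms <api/transforms>
   mmcv.arraymisc <api/arraymisc>
   mmcv.utils <api/utils>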

Indices and tables
==================

* :ref:`genindex`
* :ref:`search`


================================================
FILE: docs/en/make.bat
================================================
@ECHO OFF

REM Run from the directory containing this script (%~dp0) so that the
REM relative SOURCEDIR/BUILDDIR below resolve no matter where it is invoked.
pushd %~dp0

REM Command file for Sphinx documentation
REM
REM Usage: make.bat TARGET
REM   TARGET is passed to "sphinx-build -M" (with no argument, the Sphinx
REM   help target is run instead).
REM Environment variables read:
REM   SPHINXBUILD - command used to run Sphinx (defaults to "sphinx-build")
REM   SPHINXOPTS  - extra options appended to every sphinx-build invocation

REM Fall back to "sphinx-build" when the caller did not set SPHINXBUILD.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

REM No target given: show the Sphinx help instead of building.
if "%1" == "" goto help

REM Probe for the Sphinx executable with all output discarded. An errorlevel
REM of 9009 or higher is treated as "command not found"; in that case print
REM installation hints and exit with status 1.
REM NOTE(review): this early exit skips the popd at :end - confirm that is intended.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Run the requested target through Sphinx "make mode" (-M).
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

REM Reached only when no target was given: run the Sphinx help target.
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

REM Common exit path: restore the directory saved by pushd above.
:end
popd


================================================
FILE: docs/en/switch_language.md
================================================
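## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>

## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>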


================================================
FILE: docs/en/understand_mmcv/cnn.md
================================================
## CNN

We provide some building bricks for CNNs, including layer building, module bundles and weight initialization.

### Layer building

We may need to try different layers of the same type when running experiments,
but do not want to modify the code from time to time.
Here we provide some layer building methods to construct layers from a dict,
which can be written in configs or specified via command line arguments.

#### Usage

The simplest example is

```python
from mmcv.cnn import build_conv_layer

cfg = dict(type='Conv3d')
layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)
```

- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d).
- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d).
- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU.
- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle.
- `build_padding_layer`: Supported types are zero, reflect, replicate.

#### Extension

We also allow extending the building methods with custom layers and operators.

1. Write and register your own module.

   ```python
   from mmengine.registry import MODELS

   @MODELS.register_module()
   class MyUpsample:

       def __init__(self, scale_factor):
           pass

       def forward(self, x):
           pass
   ```

2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it.

   ```python
   from mmcv.cnn import build_upsample_layer

   cfg = dict(type='MyUpsample', scale_factor=2)
   layer = build_upsample_layer(cfg)
   ```

### Module bundles

We also provide common module bundles to facilitate the network construction.
`ConvModule` is a bundle of convolution, normalization and activation layers,
please refer to the [api](api.html#mmcv.cnn.ConvModule) for details.

```python
from mmcv.cnn import ConvModule

# conv + bn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
# conv + gn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))
# conv + relu
conv = ConvModule(3, 8, 2)
# conv
conv = ConvModule(3, 8, 2, act_cfg=None)
# conv + leaky relu
conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))
# bn + conv + relu
conv = ConvModule(
    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
```


================================================
FILE: docs/en/understand_mmcv/data_process.md
================================================
## Data Process

### Image

This module provides some image processing methods, which require `opencv` to be installed first.

#### Read/Write/Show

To read or write image files, use `imread` or `imwrite`.

```python
import mmcv

img = mmcv.imread('test.jpg')
img = mmcv.imread('test.jpg', flag='grayscale')
img_ = mmcv.imread(img)  # nothing will happen, img_ = img
mmcv.imwrite(img, 'out.jpg')
```

To read images from bytes

```python
with open('test.jpg', 'rb') as f:
    data = f.read()
img = mmcv.imfrombytes(data)
```

To show an image file or a loaded image

```python
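mmcv.imshow('tests/data/color.jpg')
# this is equivalent to
img = mmcv.imread('tests/data/color.jpg')
mmcv.imshow(img)

# show a sequence of randomly generated images in the same window
for i in range(10):
    img = np.random.randint(256, size=(100, 100, 3), dtype=np.uint8)
    mmcv.imshow(img, win_name='test image', wait_time=200)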
```

#### Color space conversion

Supported conversion methods:

- bgr2gray
- gray2bgr
- bgr2rgb
- rgb2bgr
- bgr2hsv
- hsv2bgr

```python
img = mmcv.imread('tests/data/color.jpg')
img1 = mmcv.bgr2rgb(img)
img2 = mmcv.rgb2gray(img1)
img3 = mmcv.bgr2hsv(img)
```

#### Resize

There are three resize methods. All `imresize_*` methods have an argument `return_scale`.
If this argument is `False`, the return value is merely the resized image; otherwise,
it is a tuple `(resized_img, scale)`.

```python
# resize to a given size
mmcv.imresize(img, (1000, 600), return_scale=True)

# resize to the same size of another image
mmcv.imresize_like(img, dst_img, return_scale=False)

# resize by a ratio
mmcv.imrescale(img, 0.5)

# resize so that the long edge is no longer than 1000 and the short edge is
# no longer than 800, without changing the aspect ratio
mmcv.imrescale(img, (1000, 800))
```

#### Rotate

To rotate an image by some angle, use `imrotate`. The center can be specified,
which is the center of original image by default. There are two modes of rotating,
one is to keep the image size unchanged so that some parts of the image will be
cropped after rotating, the other is to extend the image size to fit the rotated
image.

```python
img = mmcv.imread('tests/data/color.jpg')

# rotate the image clockwise by 30 degrees.
img_ = mmcv.imrotate(img, 30)

# rotate the image counterclockwise by 90 degrees.
img_ = mmcv.imrotate(img, -90)

# rotate the image clockwise by 30 degrees, and rescale it by 1.5x at the same time.
img_ = mmcv.imrotate(img, 30, scale=1.5)

# rotate the image clockwise by 30 degrees, with (100, 100) as the center.
img_ = mmcv.imrotate(img, 30, center=(100, 100))

# rotate the image clockwise by 30 degrees, and extend the image size.
img_ = mmcv.imrotate(img, 30, auto_bound=True)
```

#### Flip

To flip an image, use `imflip`.

```python
img = mmcv.imread('tests/data/color.jpg')

# flip the image horizontally
mmcv.imflip(img)

# flip the image vertically
mmcv.imflip(img, direction='vertical')
```

#### Crop

`imcrop` can crop the image with one or more regions. Each region is represented by the upper left and lower right coordinates as (x1, y1, x2, y2).

```python
import mmcv
import numpy as np

img = mmcv.imread('tests/data/color.jpg')

# crop the region (10, 10, 100, 120)
bboxes = np.array([10, 10, 100, 120])
patch = mmcv.imcrop(img, bboxes)

# crop two regions (10, 10, 100, 120) and (0, 0, 50, 50)
bboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]])
patches = mmcv.imcrop(img, bboxes)

# crop two regions, and rescale the patches by 1.2x
patches = mmcv.imcrop(img, bboxes, scale=1.2)
```

#### Padding

There are two methods, `impad` and `impad_to_multiple`, to pad an image to the
specific size with given values.

```python
img = mmcv.imread('tests/data/color.jpg')

# pad the image to (1000, 1200) with all zeros
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0)

# pad the image to (1000, 1200) with different values for three channels.
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200))

# pad the image on left, top, right, bottom borders with all zeros
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0)

# pad the image on left, top, right, bottom borders with different values
# for three channels.
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200))

# pad an image so that each edge is a multiple of some value.
img_ = mmcv.impad_to_multiple(img, 32)
```

### Video

This module provides the following functionalities:

- A `VideoReader` class with friendly APIs to read and convert videos.
- Some methods for editing (cut, concat, resize) videos.
- Optical flow read/write/warp.

#### VideoReader

The `VideoReader` class provides sequence-like APIs to access video frames.
It will internally cache the frames which have been visited.

```python
video = mmcv.VideoReader('test.mp4')

# obtain basic information
print(len(video))
print(video.width, video.height, video.resolution, video.fps)

# iterate over all frames
for frame in video:
    print(frame.shape)

# read the next frame
img = video.read()

# read a frame by index
img = video[100]

# read some frames
img = video[5:10]
```

You can also convert a video to images or generate a video from an image directory.

```python
# split a video into frames and save to a folder
video = mmcv.VideoReader('test.mp4')
video.cvt2frames('out_dir')

# generate video from frames
mmcv.frames2video('out_dir', 'test.avi')
```

#### Editing utils

There are also some methods for editing videos, which wrap the commands of ffmpeg.

```python
# cut a video clip
mmcv.cut_video('test.mp4', 'clip1.mp4', start=3, end=10, vcodec='h264')

# join a list of video clips
mmcv.concat_video(['clip1.mp4', 'clip2.mp4'], 'joined.mp4', log_level='quiet')

# resize a video with the specified size
mmcv.resize_video('test.mp4', 'resized1.mp4', (360, 240))

# resize a video with a scaling ratio of 2
mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2)
```

#### Optical flow

`mmcv` provides the following methods to operate on optical flows.

- IO
- Visualization
- Flow warping

We provide two options to dump optical flow files: uncompressed and compressed.
The uncompressed way just dumps the floating numbers to a binary file. It is
lossless but the dumped file has a larger size.
The compressed way quantizes the optical flow to 0-255 and dumps it as a
jpeg image. The flow of x-dim and y-dim will be concatenated into a single image.

1. IO

```python
flow = np.random.rand(800, 600, 2).astype(np.float32)
# dump the flow to a flo file (~3.7M)
mmcv.flowwrite(flow, 'uncompressed.flo')
# dump the flow to a jpeg file (~230K)
# the shape of the dumped image is (800, 1200)
mmcv.flowwrite(flow, 'compressed.jpg', quantize=True, concat_axis=1)

# read the flow file, the shape of loaded flow is (800, 600, 2) for both ways
flow = mmcv.flowread('uncompressed.flo')
flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1)
```

2. Visualization

It is possible to visualize optical flows with `mmcv.flowshow()`.

```python
mmcv.flowshow(flow)
```

![flow visualization](../_static/flow_visualization.png)

3. Flow warping

```python
img1 = mmcv.imread('img1.jpg')
flow = mmcv.flowread('flow.flo')
warped_img2 = mmcv.flow_warp(img1, flow)
```

img1 (left) and img2 (right)

![raw images](../_static/flow_raw_images.png)

optical flow (img2 -> img1)

![optical flow](../_static/flow_img2toimg1.png)

warped image and difference with ground truth

![warped image](../_static/flow_warp_diff.png)


================================================
FILE: docs/en/understand_mmcv/data_transform.md
================================================
# Data Transformation

In the OpenMMLab algorithm library, dataset construction and data preparation are decoupled. Usually, the construction of the dataset only parses the dataset and records the basic information of each sample, while the data preparation is a series of data transformations including data loading, preprocessing, formatting, and other operations performed according to the basic information of the sample.

## Design of data transformation

In MMCV, we use various callable data transformation classes to manipulate data. These data transformation classes can accept several configuration parameters for the instantiation and then process the input data dictionary by `__call__` method. All data transformation methods accept a dictionary as the input and produce the output as a dictionary as well. A simple example is as follows:

```python
>>> import numpy as np
>>> from mmcv.transforms import Resize
>>>
>>> transform = Resize(scale=(224, 224))
>>> data_dict = {'img': np.random.rand(256, 256, 3)}
>>> data_dict = transform(data_dict)
>>> print(data_dict['img'].shape)
(224, 224, 3)
```

The data transformation class reads some fields of the input dictionary and may add or update some fields. The keys of these fields are mostly fixed. For example, `Resize` will always read fields such as `"img"` in the input dictionary. More information about the conventions for input and output fields can be found in the documentation of the corresponding class.

```{note}
By convention, the order of image shape which is used as **initialization parameters** in data transformation (such as Resize, Pad) is (width, height). In the dictionary returned by the data transformation, the image related shape, such as `img_shape`, `ori_shape`, `pad_shape`, etc., is (height, width).
```

MMCV provides a unified base class called `BaseTransform` for all data transformation classes:

```python
class BaseTransform(metaclass=ABCMeta):

    def __call__(self, results: dict) -> dict:

        return self.transform(results)

    @abstractmethod
    def transform(self, results: dict) -> dict:
        pass
```

All data transformation classes must inherit `BaseTransform` and implement the `transform` method. Both the input and output of the `transform` method are a dictionary. In the **Customize data transformation classes** section, we will describe how to implement a data transformation class in more detail.

## Data pipeline

As mentioned above, the inputs and outputs of all data transformations are dictionaries. Moreover, according to the \[Convention on Datasets\] (TODO) in OpenMMLab, the basic information of each sample in the dataset is also a dictionary. This way, we can connect all data transformation operations end to end and combine them into a data pipeline. This pipeline inputs the information dictionary of the samples in the dataset and outputs the information dictionary after a series of processing.

Taking the classification task as an example, we show a typical data pipeline in the figure below. For each sample, the information stored in the dataset is a dictionary, as shown on the far left in the figure. After each data transformation operation represented by the blue block, a new field (marked in green) will be added to the data dictionary or an existing field (marked in orange) will be updated.

The data pipeline is a list of several data transformation configuration dictionaries in the configuration file. Each dataset needs to set the parameter `pipeline` to define the data preparation operations the dataset needs to perform. The configuration of the above data pipeline in the configuration file is as follows: ```python pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', size=256, keep_ratio=True), dict(type='CenterCrop', crop_size=224), dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='ClsFormatBundle') ] dataset = dict( ... pipeline=pipeline, ... ) ``` ## Common data transformation classes The commonly used data transformation classes can be roughly divided into data loading, data preprocessing and augmentation, and data formatting. In MMCV, we provide some commonly used classes as follows: ### Data loading To support the loading of large-scale datasets, data is usually not loaded when `Dataset` is initialized. Only the corresponding path is loaded. Therefore, it is necessary to load specific data in the data pipeline. | Class | Feature | | :-------------------------: | :--------------------------------------------: | | [`LoadImageFromFile`](TODO) | Load from file path | | [`LoadAnnotations`](TODO) | Load and organize the annotations (bbox, etc.) | ### Data preprocessing and enhancement Data preprocessing and augmentation usually involve transforming the image itself, such as cropping, padding, scaling, etc. 
| Class | Feature | | :------------------------------: | :----------------------------------------------------: | | [`Pad`](TODO) | Padding | | [`CenterCrop`](TODO) | Center crop | | [`Normalize`](TODO) | Image normalization | | [`Resize`](TODO) | Resize to the specified size or ratio | | [`RandomResize`](TODO) | Scale the image randomly within the specified range | | [`RandomMultiscaleResize`](TODO) | Scale the image to a random size from multiple options | | [`RandomGrayscale`](TODO) | Random grayscale | | [`RandomFlip`](TODO) | Random flip | | [`MultiScaleFlipAug`](TODO) | Support scaling and flipping during the testing | ### Data formatting Data formatting operations are type conversions performed on the data. | Class | Feature | | :---------------------: | :------------------------------------------: | | [`ToTensor`](TODO) | Convert the specified data to `torch.Tensor` | | [`ImageToTensor`](TODO) | Convert the image to `torch.Tensor` | ## Customize data transformation classes To implement a new data transformation class, you must inherit `BaseTransform` and implement the `transform` method. Here, we use a simple flip transform (`MyFlip`) as an example: ```python import random import mmcv from mmcv.transforms import BaseTransform, TRANSFORMS @TRANSFORMS.register_module() class MyFlip(BaseTransform): def __init__(self, direction: str): super().__init__() self.direction = direction def transform(self, results: dict) -> dict: img = results['img'] results['img'] = mmcv.imflip(img, direction=self.direction) return results ``` Now, we can instantiate `MyFlip` as a callable object to handle our data dictionary. ```python import numpy as np transform = MyFlip(direction='horizontal') data_dict = {'img': np.random.rand(224, 224, 3)} data_dict = transform(data_dict) processed_img = data_dict['img'] ``` Alternatively, use `MyFlip` transform in the `pipeline` of the config file. ```python pipeline = [ ... dict(type='MyFlip', direction='horizontal'), ... 
] ``` It should be noted that if you want to use it in the configuration file, you must ensure that the file where the `MyFlip` class is located can be imported at the runtime. ## Transform wrapper Transform wrappers are a special class of data transformations. They do not operate on images, labels or other information in the data dictionary by themselves. Instead, they enhance the behavior of data transformations defined in them. ### KeyMapper `KeyMapper` is used to map fields in the data dictionary. For example, image processing transforms usually get their values from the `"img"` field in the data dictionary. But sometimes we want these transforms to handle images in other fields in the data dictionary, such as the `"gt_img"` field. When used with registry and configuration file, the field map wrapper should be used as follows: ```python pipeline = [ ... dict(type='KeyMapper', mapping={ 'img': 'gt_img', # map "gt_img" to "img" 'mask': ..., # The "mask" field in the raw data is not used. That is, for wrapped data transformations, the "mask" field is not included in the data }, auto_remap=True, # remap "img" back to "gt_img" after the transformation transforms=[ # only need to specify "img" in `RandomFlip` dict(type='RandomFlip'), ]) ... ] ``` With `KeyMapper`, we don't need to consider various possible input field names in the `transform` method when we implement the data transformation class. We only need to deal with the default fields. ### RandomChoice and RandomApply `RandomChoice` is used to randomly select a data transformation pipeline from the given choices. With this wrapper, we can easily implement some data augmentation functions, such as AutoAugment. In configuration file, you can use `RandomChoice` as follows: ```python pipeline = [ ... dict(type='RandomChoice', transforms=[ [ dict(type='Posterize', bits=4), dict(type='Rotate', angle=30.) 
], # the first combo option [ dict(type='Equalize'), dict(type='Rotate', angle=30) ], # the second combo option ], prob=[0.4, 0.6] # the prob of each combo ) ... ] ``` `RandomApply` is used to randomly perform a combination of data transformations with a specified probability. For example: ```python pipeline = [ ... dict(type='RandomApply', transforms=[dict(type='Rotate', angle=30.)], prob=0.3) # perform the transformation with prob as 0.3 ... ] ``` ### TransformBroadcaster Usually, a data transformation class only reads the target of an operation from one field. While we can also use `KeyMapper` to change the fields read, there is no way to apply transformations to the data of multiple fields at once. To achieve this, we need to use the multi-target extension wrapper `TransformBroadcaster`. `TransformBroadcaster` has two uses, one is to apply data transformation to multiple specified fields, and the other is to apply data transformation to a group of targets under a field. 1. Apply to multiple fields Suppose we need to apply a data transformation to images in two fields `"lq"` (low-quality) and `"gt"` (ground-truth). ```python pipeline = [ dict(type='TransformBroadcaster', # apply to the "lq" and "gt" fields respectively, and set the "img" field to both mapping={'img': ['lq', 'gt']}, # remap the "img" field back to the original field after the transformation auto_remap=True, # whether to share random variables in the transformation of each target # more introduction will be referred in the following chapters (random variable sharing) share_random_params=True, transforms=[ # only need to manipulate the "img" field in the `RandomFlip` class dict(type='RandomFlip'), ]) ] ``` In the `mapping` setting of the multi-target extension, we can also use `...` to ignore the specified original field. As shown in the following example, the wrapped `RandomCrop` will crop the image in the field `"img"` and update the size of the cropped image if the field `"img_shape"` exists. 
If we want to do the same random cropping for both image fields `"lq"` and `"gt"` at the same time but update the `"img_shape"` field only once, we can do it as in the example: ```python pipeline = [ dict(type='TransformBroadcaster', mapping={ 'img': ['lq', 'gt'], 'img_shape': ['img_shape', ...], }, # remap the "img" and "img_shape" fields back to their original fields after the transformation auto_remap=True, # whether to share random variables in the transformation of each target # more introduction will be referred in the following chapters (random variable sharing) share_random_params=True, transforms=[ # "img" and "img_shape" fields are manipulated in the `RandomCrop` class # if "img_shape" is missing, only operate on "img" dict(type='RandomCrop'), ]) ] ``` 2. A set of targets applied to a field Suppose we need to apply a data transformation to the `"images"` field, which is a list of images. ```python pipeline = [ dict(type='TransformBroadcaster', # map each image under the "images" field to the "img" field mapping={'img': 'images'}, # remap the images under the "img" field back to the list in the "images" field after the transformation auto_remap=True, # whether to share random variables in the transformation of each target share_random_params=True, transforms=[ # in the `RandomFlip` transformation class, we only need to manipulate the "img" field dict(type='RandomFlip'), ]) ] ``` #### Decorator `cache_randomness` In `TransformBroadcaster`, we provide the `share_random_params` option to support sharing random states across multiple data transformations. For example, in a super-resolution task, we want to apply **the same** random transformations **simultaneously** to the low-resolution image and the original image. If we use this function in a custom data transformation class, we need to mark which random variables support sharing in the class. This can be achieved with the decorator `cache_randomness`. 
Taking `MyFlip` from the above example, we want to perform flipping randomly with a certain probability: ```python from mmcv.transforms.utils import cache_randomness @TRANSFORMS.register_module() class MyRandomFlip(BaseTransform): def __init__(self, prob: float, direction: str): super().__init__() self.prob = prob self.direction = direction @cache_randomness # label the output of the method as a shareable random variable def do_flip(self): flip = True if random.random() > self.prob else False return flip def transform(self, results: dict) -> dict: img = results['img'] if self.do_flip(): results['img'] = mmcv.imflip(img, direction=self.direction) return results ``` In the above example, we decorate the `do_flip` method with `cache_randomness`, marking the method return value `flip` as a random variable that supports sharing. Therefore, in the transformation of `TransformBroadcaster` to multiple targets, the value of this variable will remain the same. #### Decorator `avoid_cache_randomness` In some cases, we cannot separate the process of generating random variables in data transformation into a class method. For example, modules from third-party libraries used in data transformation encapsulate the relevant parts of random variables inside, making them impossible to be extracted as class methods for data transformation. Such data transformations cannot support shared random variables through the decorator `cache_randomness` annotation, and thus cannot share random variables during multi-objective expansion. To avoid misuse of such data transformations in multi-object extensions, we provide another decorator, `avoid_cache_randomness`, to mark such data transformations: ```python from mmcv.transforms.utils import avoid_cache_randomness @TRANSFORMS.register_module() @avoid_cache_randomness class MyRandomTransform(BaseTransform): def transform(self, results: dict) -> dict: ... 
``` Data transformation classes marked with `avoid_cache_randomness` will throw an exception when their instance is wrapped by `TransformBroadcaster` and the parameter `share_random_params` is set to True. This reminds the user not to use it in this way. There are a few things to keep in mind when using `avoid_cache_randomness`: 1. `avoid_cache_randomness` is only used to decorate data transformation classes (subclasses of `BaseTransform`) and cannot be used to decorate other general classes, class methods, or functions 2. When a data transformation decorated with `avoid_cache_randomness` is used as a base class, its subclasses **will not inherit** its feature. If the subclass is still unable to share random variables, `avoid_cache_randomness` should be used again. 3. A data transformation needs to be modified with `avoid_cache_randomness` only when a data transformation is random and cannot share its random parameters. Data transformations without randomness require no decoration ================================================ FILE: docs/en/understand_mmcv/ops.md ================================================ ## ops We implement common ops used in detection, segmentation, etc. 
| Device | CPU | CUDA | MLU | MPS | Ascend | | ---------------------------- | --- | ---- | --- | --- | ------ | | ActiveRotatedFilter | √ | √ | | | √ | | AssignScoreWithK | | √ | | | | | BallQuery | | √ | √ | | √ | | BBoxOverlaps | | √ | √ | √ | √ | | BorderAlign | | √ | | | | | BoxIouRotated | √ | √ | √ | | √ | | BoxIouQuadri | √ | √ | | | | | CARAFE | | √ | √ | | | | ChamferDistance | | √ | | | √ | | CrissCrossAttention | | √ | | | | | ContourExpand | √ | | | | | | ConvexIoU | | √ | | | | | CornerPool | | √ | | | | | Correlation | | √ | | | | | Deformable Convolution v1/v2 | √ | √ | √ | | √ | | Deformable RoIPool | | √ | √ | | √ | | DiffIoURotated | | √ | √ | | | | DynamicScatter | | √ | √ | | | | FurthestPointSample | | √ | | | | | FurthestPointSampleWithDist | | √ | | | | | FusedBiasLeakyrelu | | √ | | | √ | | GatherPoints | | √ | | | √ | | GroupPoints | | √ | | | | | Iou3d | | √ | √ | | | | KNN | | √ | | | | | MaskedConv | | √ | √ | | √ | | MergeCells | | √ | | | | | MinAreaPolygon | | √ | | | | | ModulatedDeformConv2d | √ | √ | √ | | √ | | MultiScaleDeformableAttn | | √ | √ | | √ | | NMS | √ | √ | √ | | √ | | NMSRotated | √ | √ | √ | | √ | | NMSQuadri | √ | √ | | | | | PixelGroup | √ | | | | | | PointsInBoxes | √ | √ | | | | | PointsInPolygons | | √ | | | √ | | PSAMask | √ | √ | √ | | √ | | RotatedFeatureAlign | √ | √ | √ | | √ | | RoIPointPool3d | | √ | √ | | | | RoIPool | | √ | √ | | √ | | RoIAlignRotated | √ | √ | √ | | √ | | RiRoIAlignRotated | | √ | | | | | RoIAlign | √ | √ | √ | | √ | | RoIAwarePool3d | | √ | √ | | | | SAConv2d | | √ | | | | | SigmoidFocalLoss | | √ | √ | | √ | | SoftmaxFocalLoss | | √ | | | √ | | SoftNMS | | √ | | | | | Sparse Convolution | | √ | √ | | | | Synchronized BatchNorm | | √ | | | | | ThreeInterpolate | | √ | | | | | ThreeNN | | √ | √ | | | | TINShift | | √ | √ | | | | UpFirDn2d | | √ | | | | | Voxelization | √ | √ | √ | | √ | | PrRoIPool | | √ | | | | | BezierAlign | √ | √ | | | | | BiasAct | | √ | | | | | FilteredLrelu | | 
√ | | | | | Conv2dGradfix | | √ | | | | ================================================ FILE: docs/en/understand_mmcv/visualization.md ================================================ ## Visualization `mmcv` can show images and annotations (currently supported types include bounding boxes). ```python # show an image file mmcv.imshow('a.jpg') # show a loaded image img = np.random.rand(100, 100, 3) mmcv.imshow(img) # show image with bounding boxes img = np.random.rand(100, 100, 3) bboxes = np.array([[0, 0, 50, 50], [20, 20, 60, 60]]) mmcv.imshow_bboxes(img, bboxes) ``` `mmcv` can also visualize special images such as optical flows. ```python flow = mmcv.flowread('test.flo') mmcv.flowshow(flow) ``` ================================================ FILE: docs/zh_cn/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/zh_cn/_static/css/readthedocs.css ================================================ .header-logo { background-image: url("../image/mmcv-logo.png"); background-size: 85px 40px; height: 40px; width: 85px; } table.colwidths-auto td { width: 50% } ================================================ FILE: docs/zh_cn/_static/version.json ================================================ { "Linux": [ { "cuda": "12.1", "torch": "2.4.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.4.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.5", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.10.x", 
"mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.0", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.2", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.2", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.2", "torch": "1.5.x", "mmcv": [ "2.0.0rc3" ] }, { "cuda": "10.1", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.1", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.1", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "10.1", "torch": "1.5.x", "mmcv": [ "2.0.0rc3" ] }, { "cuda": "9.2", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", 
"2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "9.2", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "9.2", "torch": "1.5.x", "mmcv": [ "2.0.0rc3", "2.0.0rc2" ] }, { "cuda": "cpu", "torch": "2.4.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "cpu", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "1.5.x", "mmcv": [ "2.0.0rc3", "2.0.0rc2" ] } ], "Windows": [ { "cuda": "12.1", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "12.1", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "11.8", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "11.8", "torch": "2.0.x", "mmcv": 
[ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "11.7", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "11.6", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.5", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.3", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "11.1", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.2", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3" ] }, { "cuda": "10.2", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] 
}, { "cuda": "10.1", "torch": "1.8.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "10.1", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3" ] }, { "cuda": "10.1", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "2.3.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.2.x", "mmcv": [ "2.2.0" ] }, { "cuda": "cpu", "torch": "2.1.x", "mmcv": [ "2.2.0", "2.1.0" ] }, { "cuda": "cpu", "torch": "2.0.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.13.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.12.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.11.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.10.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.9.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.8.x", "mmcv": [ "2.2.0", "2.1.0", "2.0.1", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] }, { "cuda": "cpu", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0rc1" ] } ], "macOS": [ { "cuda": "cpu", "torch": "2.1.x", "mmcv": [ "2.1.0" ] }, { "cuda": "cpu", "torch": "2.0.x", "mmcv": [ "2.1.0", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.13.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0" ] }, { "cuda": "mps", "torch": "1.13.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3" ] }, { "cuda": "cpu", "torch": "1.12.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.11.x", "mmcv": [ "2.1.0", "2.0.0rc4", 
"2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.10.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.9.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.8.x", "mmcv": [ "2.1.0", "2.0.0rc4", "2.0.0rc3", "2.0.0rc2", "2.0.0" ] }, { "cuda": "cpu", "torch": "1.7.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2" ] }, { "cuda": "cpu", "torch": "1.6.x", "mmcv": [ "2.0.0rc4", "2.0.0rc3", "2.0.0rc2" ] } ] } ================================================ FILE: docs/zh_cn/_templates/classtemplate.rst ================================================ .. role:: hidden :class: hidden-section .. currentmodule:: {{ module }} {{ name | underline}} .. autoclass:: {{ name }} :members: .. autogenerated from source/_templates/classtemplate.rst note it does not have :inherited-members: ================================================ FILE: docs/zh_cn/api/arraymisc.rst ================================================ .. role:: hidden :class: hidden-section mmcv.arraymisc =================================== .. contents:: mmcv.arraymisc :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.arraymisc .. autosummary:: :toctree: generated :nosignatures: quantize dequantize ================================================ FILE: docs/zh_cn/api/cnn.rst ================================================ .. role:: hidden :class: hidden-section mmcv.cnn =================================== .. contents:: mmcv.cnn :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.cnn Module ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst ContextBlock Conv2d Conv3d ConvAWS2d ConvModule ConvTranspose2d ConvTranspose3d ConvWS2d DepthwiseSeparableConvModule GeneralizedAttention HSigmoid HSwish LayerScale Linear MaxPool2d MaxPool3d NonLocal1d NonLocal2d NonLocal3d Scale Swish Conv2dRFSearchOp Build Function ---------------- .. 
autosummary:: :toctree: generated :nosignatures: build_activation_layer build_conv_layer build_norm_layer build_padding_layer build_plugin_layer build_upsample_layer Miscellaneous ---------------- .. autosummary:: :toctree: generated :nosignatures: fuse_conv_bn conv_ws_2d is_norm make_res_layer make_vgg_layer get_model_complexity_info ================================================ FILE: docs/zh_cn/api/image.rst ================================================ .. role:: hidden :class: hidden-section mmcv.image =================================== .. contents:: mmcv.image :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.image IO ---------------- .. autosummary:: :toctree: generated :nosignatures: imfrombytes imread imwrite use_backend Color Space ---------------- .. autosummary:: :toctree: generated :nosignatures: bgr2gray bgr2hls bgr2hsv bgr2rgb bgr2ycbcr gray2bgr gray2rgb hls2bgr hsv2bgr imconvert rgb2bgr rgb2gray rgb2ycbcr ycbcr2bgr ycbcr2rgb Geometric ---------------- .. autosummary:: :toctree: generated :nosignatures: cutout imcrop imflip impad impad_to_multiple imrescale imresize imresize_like imresize_to_multiple imrotate imshear imtranslate rescale_size Photometric ---------------- .. autosummary:: :toctree: generated :nosignatures: adjust_brightness adjust_color adjust_contrast adjust_hue adjust_lighting adjust_sharpness auto_contrast clahe imdenormalize imequalize iminvert imnormalize lut_transform posterize solarize Miscellaneous ---------------- .. autosummary:: :toctree: generated :nosignatures: tensor2imgs ================================================ FILE: docs/zh_cn/api/ops.rst ================================================ .. role:: hidden :class: hidden-section mmcv.ops =================================== .. contents:: mmcv.ops :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.ops .. 
autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst BorderAlign CARAFE CARAFENaive CARAFEPack Conv2d ConvTranspose2d CornerPool Correlation CrissCrossAttention DeformConv2d DeformConv2dPack DeformRoIPool DeformRoIPoolPack DynamicScatter FusedBiasLeakyReLU GroupAll Linear MaskedConv2d MaxPool2d ModulatedDeformConv2d ModulatedDeformConv2dPack ModulatedDeformRoIPoolPack MultiScaleDeformableAttention PSAMask PointsSampler PrRoIPool QueryAndGroup RiRoIAlignRotated RoIAlign RoIAlignRotated RoIAwarePool3d RoIPointPool3d RoIPool SAConv2d SigmoidFocalLoss SimpleRoIAlign SoftmaxFocalLoss SparseConv2d SparseConv3d SparseConvTensor SparseConvTranspose2d SparseConvTranspose3d SparseInverseConv2d SparseInverseConv3d SparseMaxPool2d SparseMaxPool3d SparseModule SparseSequential SubMConv2d SubMConv3d SyncBatchNorm TINShift Voxelization .. autosummary:: :toctree: generated :nosignatures: active_rotated_filter assign_score_withk ball_query batched_nms bbox_overlaps border_align box_iou_rotated boxes_iou3d boxes_iou_bev boxes_overlap_bev carafe carafe_naive chamfer_distance contour_expand convex_giou convex_iou deform_conv2d deform_roi_pool diff_iou_rotated_2d diff_iou_rotated_3d dynamic_scatter furthest_point_sample furthest_point_sample_with_dist fused_bias_leakyrelu gather_points grouping_operation knn masked_conv2d min_area_polygons modulated_deform_conv2d nms nms3d nms3d_normal nms_bev nms_match nms_normal_bev nms_rotated pixel_group point_sample points_in_boxes_all points_in_boxes_cpu points_in_boxes_part points_in_polygons prroi_pool rel_roi_point_to_rel_img_point riroi_align_rotated roi_align roi_align_rotated roi_pool rotated_feature_align scatter_nd sigmoid_focal_loss soft_nms softmax_focal_loss three_interpolate three_nn tin_shift upfirdn2d voxelization ================================================ FILE: docs/zh_cn/api/transforms.rst ================================================ .. 
role:: hidden :class: hidden-section mmcv.transforms =================================== .. currentmodule:: mmcv.transforms .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst BaseTransform TestTimeAug Loading ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst LoadAnnotations LoadImageFromFile Processing ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst CenterCrop MultiScaleFlipAug Normalize Pad RandomChoiceResize RandomFlip RandomGrayscale RandomResize Resize ToTensor ImageToTensor Wrapper ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst Compose KeyMapper RandomApply RandomChoice TransformBroadcaster ================================================ FILE: docs/zh_cn/api/utils.rst ================================================ .. role:: hidden :class: hidden-section mmcv.utils =================================== .. contents:: mmcv.utils :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.utils .. autosummary:: :toctree: generated :nosignatures: IS_CUDA_AVAILABLE IS_MLU_AVAILABLE IS_MPS_AVAILABLE collect_env jit skip_no_elena ================================================ FILE: docs/zh_cn/api/video.rst ================================================ .. role:: hidden :class: hidden-section mmcv.video =================================== .. contents:: mmcv.video :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.video IO ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst VideoReader Cache .. autosummary:: :toctree: generated :nosignatures: frames2video Optical Flow ---------------- .. autosummary:: :toctree: generated :nosignatures: dequantize_flow flow_from_bytes flow_warp flowread flowwrite quantize_flow sparse_flow_from_bytes Video Processing ---------------- .. 
autosummary:: :toctree: generated :nosignatures: concat_video convert_video cut_video resize_video ================================================ FILE: docs/zh_cn/api/visualization.rst ================================================ .. role:: hidden :class: hidden-section mmcv.visualization =================================== .. contents:: mmcv.visualization :depth: 2 :local: :backlinks: top .. currentmodule:: mmcv.visualization Color ---------------- .. autosummary:: :toctree: generated :nosignatures: :template: classtemplate.rst Color .. autosummary:: :toctree: generated :nosignatures: color_val Image ---------------- .. autosummary:: :toctree: generated :nosignatures: imshow imshow_bboxes imshow_det_bboxes Optical Flow ---------------- .. autosummary:: :toctree: generated :nosignatures: flow2rgb flowshow make_color_wheel ================================================ FILE: docs/zh_cn/community/code_style.md ================================================ ## 代码规范 ### 代码规范标准 #### PEP 8 —— Python 官方代码规范 [Python 官方的代码风格指南](https://www.python.org/dev/peps/pep-0008/),包含了以下几个方面的内容: - 代码布局,介绍了 Python 中空行、断行以及导入相关的代码风格规范。比如一个常见的问题:当我的代码较长,无法在一行写下时,何处可以断行? - 表达式,介绍了 Python 中表达式空格相关的一些风格规范。 - 尾随逗号相关的规范。当列表较长,无法一行写下而写成如下逐行列表时,推荐在末项后加逗号,从而便于追加选项、版本控制等。 ```python # Correct: FILES = ['setup.cfg', 'tox.ini'] # Correct: FILES = [ 'setup.cfg', 'tox.ini', ] # Wrong: FILES = ['setup.cfg', 'tox.ini',] # Wrong: FILES = [ 'setup.cfg', 'tox.ini' ] ``` - 命名相关规范、注释相关规范、类型注解相关规范,我们将在后续章节中做详细介绍。 "A style guide is about consistency. Consistency with this style guide is important. Consistency within a project is more important. Consistency within one module or function is the most important." 
PEP 8 -- Style Guide for Python Code :::{note} PEP 8 的代码规范并不是绝对的,项目内的一致性要优先于 PEP 8 的规范。OpenMMLab 各个项目都在 setup.cfg 设定了一些代码规范的设置,请遵照这些设置。一个例子是在 PEP 8 中有如下一个例子: ```python # Correct: hypot2 = x*x + y*y # Wrong: hypot2 = x * x + y * y ``` 这一规范是为了指示不同优先级,但 OpenMMLab 的设置中通常没有启用 yapf 的 `ARITHMETIC_PRECEDENCE_INDICATION` 选项,因而格式规范工具不会按照推荐样式格式化,以设置为准。 ::: #### Google 开源项目风格指南 [Google 使用的编程风格指南](https://google.github.io/styleguide/pyguide.html),包括了 Python 相关的章节。相较于 PEP 8,该指南提供了更为详尽的代码指南。该指南包括了语言规范和风格规范两个部分。 其中,语言规范对 Python 中很多语言特性进行了优缺点的分析,并给出了使用指导意见,如异常、Lambda 表达式、列表推导式、metaclass 等。 风格规范的内容与 PEP 8 较为接近,大部分约定建立在 PEP 8 的基础上,也有一些更为详细的约定,如函数长度、TODO 注释、文件与 socket 对象的访问等。 推荐将该指南作为参考进行开发,但不必严格遵照,一来该指南存在一些 Python 2 兼容需求,例如指南中要求所有无基类的类应当显式地继承 Object, 而在仅使用 Python 3 的环境中,这一要求是不必要的,依本项目中的惯例即可。二来 OpenMMLab 的项目作为框架级的开源软件,不必对一些高级技巧过于避讳,尤其是 MMCV。但尝试使用这些技巧前应当认真考虑是否真的有必要,并寻求其他开发人员的广泛评估。 另外需要注意的一处规范是关于包的导入,在该指南中,要求导入本地包时必须使用路径全称,且导入的每一个模块都应当单独成行,通常这是不必要的,而且也不符合目前项目的开发惯例,此处进行如下约定: ```python # Correct from mmcv.cnn.bricks import (Conv2d, build_norm_layer, DropPath, MaxPool2d, Linear) from ..utils import ext_loader # Wrong from mmcv.cnn.bricks import Conv2d, build_norm_layer, DropPath, MaxPool2d, \ Linear # 使用括号进行连接,而不是反斜杠 from ...utils import is_str # 最多向上回溯一层,过多的回溯容易导致结构混乱 ``` OpenMMLab 项目使用 pre-commit 工具自动格式化代码,详情见[贡献代码](./contributing.md#代码风格)。 ### 命名规范 #### 命名规范的重要性 优秀的命名是良好代码可读的基础。基础的命名规范对各类变量的命名做了要求,使读者可以方便地根据代码名了解变量是一个类 / 局部变量 / 全局变量等。而优秀的命名则需要代码作者对于变量的功能有清晰的认识,以及良好的表达能力,从而使读者根据名称就能了解其含义,甚至帮助了解该段代码的功能。 #### 基础命名规范 | 类型 | 公有 | 私有 | | --------------- | ---------------- | ------------------ | | 模块 | lower_with_under | \_lower_with_under | | 包 | lower_with_under | | | 类 | CapWords | \_CapWords | | 异常 | CapWordsError | | | 函数(方法) | lower_with_under | \_lower_with_under | | 函数 / 方法参数 | lower_with_under | | | 全局 / 类内常量 | CAPS_WITH_UNDER | \_CAPS_WITH_UNDER | | 全局 / 类内变量 | lower_with_under | \_lower_with_under | | 变量 | lower_with_under | \_lower_with_under | | 局部变量 | lower_with_under | | 注意: - 
尽量避免变量名与保留字冲突,特殊情况下如不可避免,可使用一个后置下划线,如 class\_ - 尽量不要使用过于简单的命名,除了约定俗成的循环变量 i,文件变量 f,错误变量 e 等。 - 不会被用到的变量可以命名为 \_,逻辑检查器会将其忽略。 #### 命名技巧 良好的变量命名需要保证三点: 1. 含义准确,没有歧义 2. 长短适中 3. 前后统一 ```python # Wrong class Masks(metaclass=ABCMeta): # 命名无法表现基类;Instance or Semantic? pass # Correct class BaseInstanceMasks(metaclass=ABCMeta): pass # Wrong,不同地方含义相同的变量尽量用统一的命名 def __init__(self, inplanes, planes): pass def __init__(self, in_channels, out_channels): pass ``` 常见的函数命名方法: - 动宾命名法:crop_img, init_weights - 动宾倒置命名法:imread, bbox_flip 注意函数命名与参数的顺序,保证主语在前,符合语言习惯: - check_keys_exist(key, container) - check_keys_contain(container, key) 注意避免非常规或统一约定的缩写,如 nb -> num_blocks,in_nc -> in_channels ### docstring 规范 #### 为什么要写 docstring docstring 是对一个类、一个函数功能与 API 接口的详细描述,有两个功能,一是帮助其他开发者了解代码功能,方便 debug 和复用代码;二是在 Readthedocs 文档中自动生成相关的 API reference 文档,帮助不了解源代码的社区用户使用相关功能。 #### 如何写 docstring 与注释不同,一份规范的 docstring 有着严格的格式要求,以便于 Python 解释器以及 sphinx 进行文档解析,详细的 docstring 约定参见 [PEP 257](https://www.python.org/dev/peps/pep-0257/)。此处以例子的形式介绍各种文档的标准格式,参考格式为 [Google 风格](https://zh-google-styleguide.readthedocs.io/en/latest/google-python-styleguide/python_style_rules/#comments)。 1. 模块文档 代码风格规范推荐为每一个模块(即 Python 文件)编写一个 docstring,但目前 OpenMMLab 项目大部分没有此类 docstring,因此不做硬性要求。 ```python """A one line summary of the module or program, terminated by a period. Leave one blank line. The rest of this docstring should contain an overall description of the module or program. Optionally, it may also contain a brief description of exported classes and functions and/or usage examples. Typical usage example: foo = ClassFoo() bar = foo.FunctionBar() """ ``` 2. 类文档 类文档是我们最常需要编写的,此处,按照 OpenMMLab 的惯例,我们使用了与 Google 风格不同的写法。如下例所示,文档中没有使用 Attributes 描述类属性,而是使用 Args 描述 __init__ 函数的参数。 在 Args 中,遵照 `parameter (type): Description.` 的格式,描述每一个参数类型和功能。其中,多种类型可使用 `(float or str)` 的写法,可以为 None 的参数可以写为 `(int, optional)`。 ```python class BaseRunner(metaclass=ABCMeta): """The base class of Runner, a training helper for PyTorch. 
All subclasses should implement the following APIs: - ``run()`` - ``train()`` - ``val()`` - ``save_checkpoint()`` Args: model (:obj:`torch.nn.Module`): The model to be run. batch_processor (callable, optional): A callable method that process a data batch. The interface of this method should be ``batch_processor(model, data, train_mode) -> dict``. Defaults to None. optimizer (dict or :obj:`torch.optim.Optimizer`, optional): It can be either an optimizer (in most cases) or a dict of optimizers (in models that requires more than one optimizer, e.g., GAN). Defaults to None. work_dir (str, optional): The working directory to save checkpoints and logs. Defaults to None. logger (:obj:`logging.Logger`): Logger used during training. Defaults to None. (The default value is just for backward compatibility) meta (dict, optional): A dict records some import information such as environment info and seed, which will be logged in logger hook. Defaults to None. max_epochs (int, optional): Total training epochs. Defaults to None. max_iters (int, optional): Total training iterations. Defaults to None. """ def __init__(self, model, batch_processor=None, optimizer=None, work_dir=None, logger=None, meta=None, max_iters=None, max_epochs=None): ... ``` 另外,在一些算法实现的主体类中,建议加入原论文的链接;如果参考了其他开源代码的实现,则应加入 modified from,而如果是直接复制了其他代码库的实现,则应加入 copied from ,并注意源码的 License。如有必要,也可以通过 .. math:: 来加入数学公式 ```python # 参考实现 # This func is modified from `detectron2 # `_. # 复制代码 # This code was copied from the `ubelt # library`_. # 引用论文 & 添加公式 class LabelSmoothLoss(nn.Module): r"""Initializer for the label smoothed cross entropy loss. Refers to `Rethinking the Inception Architecture for Computer Vision `_. This decreases gap between output scores and encourages generalization. Labels provided to forward can be one-hot like vectors (NxC) or class indices (Nx1). And this accepts linear combination of one-hot like labels from mixup or cutmix except multi-label task. 
Args: label_smooth_val (float): The degree of label smoothing. num_classes (int, optional): Number of classes. Defaults to None. mode (str): Refers to notes, Options are "original", "classy_vision", "multi_label". Defaults to "classy_vision". reduction (str): The method used to reduce the loss. Options are "none", "mean" and "sum". Defaults to 'mean'. loss_weight (float): Weight of the loss. Defaults to 1.0. Note: if the ``mode`` is "original", this will use the same label smooth method as the original paper as: .. math:: (1-\epsilon)\delta_{k, y} + \frac{\epsilon}{K} where :math:`\epsilon` is the ``label_smooth_val``, :math:`K` is the ``num_classes`` and :math:`\delta_{k,y}` is Dirac delta, which equals 1 for k=y and 0 otherwise. if the ``mode`` is "classy_vision", this will use the same label smooth method as the `facebookresearch/ClassyVision `_ repo as: .. math:: \frac{\delta_{k, y} + \epsilon/K}{1+\epsilon} if the ``mode`` is "multi_label", this will accept labels from multi-label task and smoothing them as: .. math:: (1-2\epsilon)\delta_{k, y} + \epsilon ``` ```{note} 注意 \`\`here\`\`、\`here\`、"here" 三种引号功能是不同。 在 reStructured 语法中,\`\`here\`\` 表示一段代码;\`here\` 表示斜体;"here" 无特殊含义,一般可用来表示字符串。其中 \`here\` 的用法与 Markdown 中不同,需要多加留意。 另外还有 :obj:\`type\` 这种更规范的表示类的写法,但鉴于长度,不做特别要求,一般仅用于表示非常用类型。 ``` 3. 方法(函数)文档 函数文档与类文档的结构基本一致,但需要加入返回值文档。对于较为复杂的函数和类,可以使用 Examples 字段加入示例;如果需要对参数加入一些较长的备注,可以加入 Note 字段进行说明。 对于使用较为复杂的类或函数,比起看大段大段的说明文字和参数文档,添加合适的示例更能帮助用户迅速了解其用法。需要注意的是,这些示例最好是能够直接在 Python 交互式环境中运行的,并给出一些相对应的结果。如果存在多个示例,可以使用注释简单说明每段示例,也能起到分隔作用。 ```python def import_modules_from_strings(imports, allow_failed_imports=False): """Import modules from the given list of strings. Args: imports (list | str | None): The given module names to be imported. allow_failed_imports (bool): If True, the failed imports will return None. Otherwise, an ImportError is raise. Defaults to False. Returns: List[module] | module | None: The imported modules. 
All these three lines in docstring will be compiled into the same line in readthedocs. Examples: >>> osp, sys = import_modules_from_strings( ... ['os.path', 'sys']) >>> import os.path as osp_ >>> import sys as sys_ >>> assert osp == osp_ >>> assert sys == sys_ """ ... ``` 如果函数接口在某个版本发生了变化,需要在 docstring 中加入相关的说明,必要时添加 Note 或者 Warning 进行说明,例如: ```python class CheckpointHook(Hook): """Save checkpoints periodically. Args: out_dir (str, optional): The root directory to save checkpoints. If not specified, ``runner.work_dir`` will be used by default. If specified, the ``out_dir`` will be the concatenation of ``out_dir`` and the last level directory of ``runner.work_dir``. Defaults to None. `Changed in version 1.3.15.` file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmcv.fileio.FileClient` for details. Defaults to None. `New in version 1.3.15.` Warning: Before v1.3.15, the ``out_dir`` argument indicates the path where the checkpoint is stored. However, in v1.3.15 and later, ``out_dir`` indicates the root directory and the final path to save checkpoint is the concatenation of out_dir and the last level directory of ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A" and the value of ``runner.work_dir`` is "/path/of/B", then the final path will be "/path/of/A/B". ``` 如果参数或返回值里带有需要展开描述字段的 dict,则应该采用如下格式: ```python def func(x): r""" Args: x (None): A dict with 2 keys, ``padded_targets``, and ``targets``. - ``targets`` (list[Tensor]): A list of tensors. Each tensor has the shape of :math:`(T_i)`. Each element is the index of a character. - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`. Each item is the length of a word. Returns: dict: A dict with 2 keys, ``padded_targets``, and ``targets``. - ``targets`` (list[Tensor]): A list of tensors. Each tensor has the shape of :math:`(T_i)`. Each element is the index of a character. - ``padded_targets`` (Tensor): A tensor of shape :math:`(N)`. 
Each item is the length of a word. """ return x ``` ```{important} 为了生成 readthedocs 文档,文档的编写需要按照 ReStructrued 文档格式,否则会产生文档渲染错误,在提交 PR 前,最好生成并预览一下文档效果。 语法规范参考: - [reStructuredText Primer - Sphinx documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html#) - [Example Google Style Python Docstrings ‒ napoleon 0.7 documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html#example-google) ``` ### 注释规范 #### 为什么要写注释 对于一个开源项目,团队合作以及社区之间的合作是必不可少的,因而尤其要重视合理的注释。不写注释的代码,很有可能过几个月自己也难以理解,造成额外的阅读和修改成本。 #### 如何写注释 最需要写注释的是代码中那些技巧性的部分。如果你在下次代码审查的时候必须解释一下,那么你应该现在就给它写注释。对于复杂的操作,应该在其操作开始前写上若干行注释。对于不是一目了然的代码,应在其行尾添加注释。 —— Google 开源项目风格指南 ```python # We use a weighted dictionary search to find out where i is in # the array. We extrapolate position based on the largest num # in the array and the array size and then do binary search to # get the exact number. if i & (i-1) == 0: # True if i is 0 or a power of 2. ``` 为了提高可读性, 注释应该至少离开代码2个空格. 另一方面, 绝不要描述代码. 假设阅读代码的人比你更懂Python, 他只是不知道你的代码要做什么. —— Google 开源项目风格指南 ```python # Wrong: # Now go through the b array and make sure whenever i occurs # the next element is i+1 # Wrong: if i & (i-1) == 0: # True if i bitwise and i-1 is 0. ``` 在注释中,可以使用 Markdown 语法,因为开发人员通常熟悉 Markdown 语法,这样可以便于交流理解,如可使用单反引号表示代码和变量(注意不要和 docstring 中的 ReStructured 语法混淆) ```python # `_reversed_padding_repeated_twice` is the padding to be passed to # `F.pad` if needed (e.g., for non-zero padding types that are # implemented as two ops: padding + conv). `F.pad` accepts paddings in # reverse order than the dimension. self._reversed_padding_repeated_twice = _reverse_repeat_tuple(self.padding, 2) ``` #### 注释示例 1. 出自 `mmcv/utils/registry.py`,对于较为复杂的逻辑结构,通过注释,明确了优先级关系。 ```python # self.build_func will be set with the following priority: # 1. build_func # 2. parent.build_func # 3. 
build_from_cfg if build_func is None: if parent is not None: self.build_func = parent.build_func else: self.build_func = build_from_cfg else: self.build_func = build_func ``` 2. 出自 `mmcv/runner/checkpoint.py`,对于 bug 修复中的一些特殊处理,可以附带相关的 issue 链接,帮助其他人了解 bug 背景。 ```python def _save_ckpt(checkpoint, file): # The 1.6 release of PyTorch switched torch.save to use a new # zipfile-based file format. It will cause RuntimeError when a # checkpoint was saved in high version (PyTorch version>=1.6.0) but # loaded in low version (PyTorch version<1.6.0). More details at # https://github.com/open-mmlab/mmpose/issues/904 if digit_version(TORCH_VERSION) >= digit_version('1.6.0'): torch.save(checkpoint, file, _use_new_zipfile_serialization=False) else: torch.save(checkpoint, file) ``` ### 类型注解 #### 为什么要写类型注解 类型注解是对函数中变量的类型做限定或提示,为代码的安全性提供保障、增强代码的可读性、避免出现类型相关的错误。 Python 没有对类型做强制限制,类型注解只起到一个提示作用,通常你的 IDE 会解析这些类型注解,然后在你调用相关代码时对类型做提示。另外也有类型注解检查工具,这些工具会根据类型注解,对代码中可能出现的问题进行检查,减少 bug 的出现。 需要注意的是,通常我们不需要注释模块中的所有函数: 1. 公共的 API 需要注释 2. 在代码的安全性,清晰性和灵活性上进行权衡是否注释 3. 对于容易出现类型相关的错误的代码进行注释 4. 难以理解的代码请进行注释 5. 若代码中的类型已经稳定,可以进行注释. 对于一份成熟的代码,多数情况下,即使注释了所有的函数,也不会丧失太多的灵活性. #### 如何写类型注解 1. 
函数 / 方法类型注解,通常不对 self 和 cls 注释。 ```python from typing import Optional, List, Tuple # 全部位于一行 def my_method(self, first_var: int) -> int: pass # 另起一行 def my_method( self, first_var: int, second_var: float) -> Tuple[MyLongType1, MyLongType1, MyLongType1]: pass # 单独成行(具体的应用场合与行宽有关,建议结合 yapf 自动化格式使用) def my_method( self, first_var: int, second_var: float ) -> Tuple[MyLongType1, MyLongType1, MyLongType1]: pass # 引用尚未被定义的类型 class MyClass: def __init__(self, stack: List["MyClass"]) -> None: pass ``` 注:类型注解中的类型可以是 Python 内置类型,也可以是自定义类,还可以使用 Python 提供的 wrapper 类对类型注解进行装饰,一些常见的注解如下: ```python # 数值类型 from numbers import Number # 可选类型,指参数可以为 None from typing import Optional def foo(var: Optional[int] = None): pass # 联合类型,指同时接受多种类型 from typing import Union def foo(var: Union[float, str]): pass from typing import Sequence # 序列类型 from typing import Iterable # 可迭代类型 from typing import Any # 任意类型 from typing import Callable # 可调用类型 from typing import List, Dict # 列表和字典的泛型类型 from typing import Tuple # 元组的特殊格式 # 虽然在 Python 3.9 中,list, tuple 和 dict 本身已支持泛型,但为了支持之前的版本 # 我们在进行类型注解时还是需要使用 List, Tuple, Dict 类型 # 另外,在对参数类型进行注解时,尽量使用 Sequence & Iterable & Mapping # List, Tuple, Dict 主要用于返回值类型注解 # 参见 https://docs.python.org/3/library/typing.html#typing.List ``` 2. 变量类型注解,一般用于难以直接推断其类型时 ```python # Recommend: 带类型注解的赋值 a: Foo = SomeUndecoratedFunction() a: List[int] = [1, 2, 3] # List 只支持单一类型泛型,可使用 Union b: Tuple[int, int] = (1, 2) # 长度固定为 2 c: Tuple[int, ...] = (1, 2, 3) # 变长 d: Dict[str, int] = {'a': 1, 'b': 2} # Not Recommend:行尾类型注释 # 虽然这种方式被写在了 Google 开源指南中,但这是一种为了支持 Python 2.7 版本 # 而补充的注释方式,鉴于我们只支持 Python 3, 为了风格统一,不推荐使用这种方式。 a = SomeUndecoratedFunction() # type: Foo a = [1, 2, 3] # type: List[int] b = (1, 2, 3) # type: Tuple[int, ...] c = (1, "2", 3.5) # type: Tuple[int, Text, float] ``` 3. 泛型 上文中我们知道,typing 中提供了 list 和 dict 的泛型类型,那么我们自己是否可以定义类似的泛型呢?
```python from typing import Dict, Generic, TypeVar KT = TypeVar('KT') VT = TypeVar('VT') class Mapping(Generic[KT, VT]): def __init__(self, data: Dict[KT, VT]): self._data = data def __getitem__(self, key: KT) -> VT: return self._data[key] ``` 使用上述方法,我们定义了一个拥有泛型能力的映射类,实际用法如下: ```python mapping = Mapping[str, float]({'a': 0.5}) value: float = mapping['a'] ``` 另外,我们也可以利用 TypeVar 在函数签名中指定联动的多个类型: ```python from typing import TypeVar, List T = TypeVar('T') # Can be anything A = TypeVar('A', str, bytes) # Must be str or bytes def repeat(x: T, n: int) -> List[T]: """Return a list containing n references to x.""" return [x]*n def longest(x: A, y: A) -> A: """Return the longest of two strings.""" return x if len(x) >= len(y) else y ``` 更多关于类型注解的写法请参考 [typing](https://docs.python.org/3/library/typing.html)。 #### 类型注解检查工具 [mypy](https://mypy.readthedocs.io/en/stable/) 是一个 Python 静态类型检查工具。根据你的类型注解,mypy 会检查传参、赋值等操作是否符合类型注解,从而避免可能出现的 bug。 例如如下的一个 Python 脚本文件 test.py: ```python def foo(var: int) -> float: return float(var) a: str = foo('2.0') b: int = foo('3.0') # type: ignore ``` 运行 mypy test.py 可以得到如下检查结果,分别指出了第 4 行在函数调用和返回值赋值两处类型错误。而第 5 行同样存在两个类型错误,由于使用了 type: ignore 而被忽略了,只有部分特殊情况可能需要此类忽略。 ``` test.py:4: error: Incompatible types in assignment (expression has type "float", variable has type "str") test.py:4: error: Argument 1 to "foo" has incompatible type "str"; expected "int" Found 2 errors in 1 file (checked 1 source file) ``` ================================================ FILE: docs/zh_cn/community/contributing.md ================================================ ## 贡献代码 欢迎加入 MMCV 社区,我们致力于打造最前沿的计算机视觉基础库,我们欢迎任何类型的贡献,包括但不限于 **修复错误** 修复代码实现错误的步骤如下: 1. 如果提交的代码改动较大,建议先提交 issue,并正确描述 issue 的现象、原因和复现方式,讨论后确认修复方案。 2. 修复错误并补充相应的单元测试,提交拉取请求。 **新增功能或组件** 1. 如果新功能或模块涉及较大的代码改动,建议先提交 issue,确认功能的必要性。 2. 实现新增功能并添加单元测试,提交拉取请求。 **文档补充** 修复文档可以直接提交拉取请求 添加文档或将文档翻译成其他语言步骤如下 1. 提交 issue,确认添加文档的必要性。 2.
添加文档,提交拉取请求。 ### 拉取请求工作流 如果你对拉取请求不了解,没关系,接下来的内容将会从零开始,一步一步地指引你如何创建一个拉取请求。如果你想深入了解拉取请求的开发模式,可以参考 GitHub [官方文档](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) #### 1. 复刻仓库 当你第一次提交拉取请求时,先复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮,复刻后的代码库将会出现在你的 GitHub 个人主页下。 将代码克隆到本地 ```shell git clone git@github.com:{username}/mmcv.git ``` 添加原代码库为上游代码库 ```bash git remote add upstream git@github.com:open-mmlab/mmcv ``` 检查 remote 是否添加成功,在终端输入 `git remote -v` ```bash origin git@github.com:{username}/mmcv.git (fetch) origin git@github.com:{username}/mmcv.git (push) upstream git@github.com:open-mmlab/mmcv (fetch) upstream git@github.com:open-mmlab/mmcv (push) ``` ```{note} 这里对 origin 和 upstream 进行一个简单的介绍,当我们使用 git clone 来克隆代码时,会默认创建一个 origin 的 remote,它指向我们克隆的代码库地址,而 upstream 则是我们自己添加的,用来指向原始代码库地址。当然如果你不喜欢它叫 upstream,也可以自己修改,比如叫 open-mmlab。我们通常向 origin 提交代码(即 fork 下来的远程仓库),然后向 upstream 提交一个 pull request。如果提交的代码和最新的代码发生冲突,再从 upstream 拉取最新的代码,和本地分支解决冲突,再提交到 origin。 ``` #### 2. 配置 pre-commit 在本地开发环境中,我们使用 [pre-commit](https://pre-commit.com/#intro) 来检查代码风格,以确保代码风格的统一。在提交代码前,需要先安装 pre-commit(需要在 MMCV 目录下执行): ```shell pip install -U pre-commit pre-commit install ``` 检查 pre-commit 是否配置成功,并安装 `.pre-commit-config.yaml` 中的钩子: ```shell pre-commit run --all-files ``` ```{note} 如果你是中国用户,由于网络原因,可能会出现安装失败的情况,这时可以使用国内源 pre-commit install -c .pre-commit-config-zh-cn.yaml pre-commit run --all-files -c .pre-commit-config-zh-cn.yaml ``` 如果安装过程被中断,可以重复执行 `pre-commit run ...` 继续安装。 如果提交的代码不符合代码风格规范,pre-commit 会发出警告,并自动修复部分错误。 如果我们想临时绕开 pre-commit 的检查提交一次代码,可以在 `git commit` 时加上 `--no-verify`(需要保证最后推送至远程仓库的代码能够通过 pre-commit 检查)。 ```shell git commit -m "xxx" --no-verify ``` #### 3. 创建开发分支 安装完 pre-commit 之后,我们需要基于 main 创建开发分支,建议的分支命名规则为 `username/pr_name`。 ```shell git checkout -b yhc/refactor_contributing_doc ``` 在后续的开发中,如果本地仓库的 main 分支落后于 upstream 的 main 分支,我们需要先拉取 upstream 的代码进行同步,再执行上面的命令 ```shell git pull upstream main ``` #### 4.
提交代码并在本地通过单元测试 - MMCV 引入了 mypy 来做静态类型检查,以增加代码的鲁棒性。因此我们在提交代码时,需要补充 Type Hints。具体规则可以参考[教程](https://zhuanlan.zhihu.com/p/519335398)。 - 提交的代码同样需要通过单元测试 ```shell # 通过全量单元测试 pytest tests # 我们需要保证提交的代码能够通过修改模块的单元测试,以 runner 为例 pytest tests/test_runner/test_runner.py ``` 如果你由于缺少依赖无法运行修改模块的单元测试,可以参考[指引-单元测试](#单元测试) - 如果修改/添加了文档,参考[指引](#文档渲染)确认文档渲染正常。 #### 5. 推送代码到远程 代码通过单元测试和 pre-commit 检查后,将代码推送到远程仓库,如果是第一次推送,可以在 `git push` 后加上 `-u` 参数以关联远程分支 ```shell git push -u origin {branch_name} ``` 这样下次就可以直接使用 `git push` 命令推送代码了,而无需指定分支和远程仓库。 #### 6. 提交拉取请求(PR) (1) 在 GitHub 的 Pull request 界面创建拉取请求 (2) 根据指引修改 PR 描述,以便于其他开发者更好地理解你的修改 描述规范详见[拉取请求规范](#拉取请求规范)   **注意事项** (a) PR 描述应该包含修改理由、修改内容以及修改后带来的影响,并关联相关 Issue(具体方式见[文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)) (b) 如果是第一次为 OpenMMLab 做贡献,需要签署 CLA (c) 检查提交的 PR 是否通过 CI(集成测试) MMCV 会在不同的平台(Linux、Window、Mac),基于不同版本的 Python、PyTorch、CUDA 对提交的代码进行单元测试,以保证代码的正确性,如果有任何一个没有通过,我们可点击上图中的 `Details` 来查看具体的测试信息,以便于我们修改代码。 (3) 如果 PR 通过了 CI,那么就可以等待其他开发者的 review,并根据 reviewer 的意见,修改代码,并重复 [4](#4-提交代码并本地通过单元测试)-[5](#5-推送代码到远程) 步骤,直到 reviewer 同意合入 PR。 所有 reviewer 同意合入 PR 后,我们会尽快将 PR 合并到主分支。 #### 7. 
解决冲突 随着时间的推移,我们的代码库会不断更新,这时候,如果你的 PR 与主分支存在冲突,你需要解决冲突,解决冲突的方式有两种: ```shell git fetch --all --prune git rebase upstream/main ``` 或者 ```shell git fetch --all --prune git merge upstream/main ``` 如果你非常善于处理冲突,那么可以使用 rebase 的方式来解决冲突,因为这能够保证你的 commit log 的整洁。如果你不太熟悉 `rebase` 的使用,那么可以使用 `merge` 的方式来解决冲突。 ### 指引 #### 单元测试 如果你无法正常执行部分模块的单元测试,例如 [video](https://github.com/open-mmlab/mmcv/tree/main/mmcv/video) 模块,可能是你的当前环境没有安装以下依赖 ```shell # Linux sudo apt-get update -y sudo apt-get install -y libturbojpeg sudo apt-get install -y ffmpeg # Windows conda install ffmpeg ``` 在提交修复代码错误或新增特性的拉取请求时,我们应该尽可能地让单元测试覆盖所有提交的代码,计算单元测试覆盖率的方法如下 ```shell python -m coverage run -m pytest /path/to/test_file python -m coverage html # check file in htmlcov/index.html ``` #### 文档渲染 在提交修复代码错误或新增特性的拉取请求时,可能会需要修改/新增模块的 docstring。我们需要确认渲染后的文档样式是正确的。 本地生成渲染后的文档的方法如下 ```shell pip install -r requirements/docs.txt cd docs/zh_cn/ # or docs/en make html # check file in ./docs/zh_cn/_build/html/index.html ``` ### 代码风格 #### Python [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范,我们使用以下工具检查和格式化代码 - [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具,是多个检查工具的封装 - [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具 - [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 - [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误 - [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 - [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具 yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到 通过配置 [pre-commit hook](https://pre-commit.com/) ,我们可以在提交代码时自动检查和格式化 `flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`, 修复 `end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,调整 `requirements.txt` 的包顺序。 pre-commit 钩子的配置可以在 [.pre-commit-config](./.pre-commit-config.yaml) 找到。 pre-commit 具体的安装使用方式见[拉取请求](#2-配置-pre-commit)。 更具体的规范请参考 [OpenMMLab 代码规范](code_style.md)。 #### C++ and CUDA
C++ 和 CUDA 的代码规范遵从 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) ### 拉取请求规范 1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 2. 一个`拉取请求`对应一个短期分支 3. 粒度要细,一个`拉取请求`只做一件事情,避免超大的`拉取请求` - Bad:实现 Faster R-CNN - Acceptable:给 Faster R-CNN 添加一个 box head - Good:给 box head 增加一个参数来支持自定义的 conv 层数 4. 每次 Commit 时需要提供清晰且有意义的 commit 信息 5. 提供清晰且有意义的`拉取请求`描述 - 标题写明白任务名称,一般格式:\[Prefix\] Short description of the pull request (Suffix) - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被 review) - 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 - 关联相关的`议题` (issue) 和其他`拉取请求` 6. 如果引入了其他三方库,或借鉴了三方库的代码,请确认它们的许可证和 mmcv 兼容,并在借鉴的代码上补充 `This code is inspired from http://` ================================================ FILE: docs/zh_cn/community/pr.md ================================================ ## 拉取请求 本文档的内容已迁移到[贡献指南](contributing.md)。 ================================================ FILE: docs/zh_cn/compatibility.md ================================================ ### v2.0.0 OpenMMLab 团队于 2022 年 9 月 1 日在世界人工智能大会发布了新一代训练引擎 [MMEngine](https://github.com/open-mmlab/mmengine),它是一个用于训练深度学习模型的基础库。相比于 MMCV,它提供了更高级且通用的训练器、接口更加统一的开放架构以及可定制化程度更高的训练流程。 OpenMMLab 团队于 2023 年 4 月 6 日发布 MMCV [v2.0.0](https://github.com/open-mmlab/mmcv/releases/tag/v2.0.0)。在 2.x 版本中,它有以下重大变化: (1)删除了以下组件: - `mmcv.fileio` 模块,删除于 PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179)。在需要使用 FileIO 的地方使用 mmengine 中的 FileIO 模块 - `mmcv.runner`、`mmcv.parallel`、`mmcv.engine` 和 `mmcv.device`,删除于 PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216) - `mmcv.utils` 的所有类(例如 `Config` 和 `Registry`)和大部分函数,删除于 PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217),只保留少数和 mmcv 相关的函数 - `mmcv.onnx`、`mmcv.tensorrt` 模块以及相关的函数,删除于 PR [#2225](https://github.com/open-mmlab/mmcv/pull/2225) - 删除 MMCV 所有的根注册器并将类或者函数注册到 MMEngine 的[根注册器](https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py) (2)新增了
[`mmcv.transforms`](https://github.com/open-mmlab/mmcv/tree/main/mmcv/transforms) 数据变换模块 (3)在 PR [#2235](https://github.com/open-mmlab/mmcv/pull/2235) 中将包名 **mmcv** 重命名为 **mmcv-lite**、 **mmcv-full** 重命名为 **mmcv**。此外,将环境变量 `MMCV_WITH_OPS` 的默认值从 0 改为 1
MMCV < 2.0 MMCV >= 2.0
```bash # 包含算子,因为 mmcv-full 的最高版本小于 2.0.0,所以无需加版本限制 pip install openmim mim install mmcv-full # 不包含算子 pip install openmim mim install "mmcv < 2.0.0" ``` ```bash # 包含算子 pip install openmim mim install mmcv # 不包含算子,因为 mmcv-lite 的起始版本为 2.0.0,所以无需加版本限制 pip install openmim mim install mmcv-lite ```
### v1.3.18 部分自定义算子对于不同的设备有不同实现,为此添加的大量宏命令与类型检查使得代码变得难以维护。例如: ```c++ if (input.device().is_cuda()) { #ifdef MMCV_WITH_CUDA CHECK_CUDA_INPUT(input); CHECK_CUDA_INPUT(rois); CHECK_CUDA_INPUT(output); CHECK_CUDA_INPUT(argmax_y); CHECK_CUDA_INPUT(argmax_x); roi_align_forward_cuda(input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); #else AT_ERROR("RoIAlign is not compiled with GPU support"); #endif } else { CHECK_CPU_INPUT(input); CHECK_CPU_INPUT(rois); CHECK_CPU_INPUT(output); CHECK_CPU_INPUT(argmax_y); CHECK_CPU_INPUT(argmax_x); roi_align_forward_cpu(input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } ``` 为此我们设计了注册与分发的机制以更好的管理这些算子实现。 ```c++ void ROIAlignForwardCUDAKernelLauncher(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned); void roi_align_forward_cuda(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { ROIAlignForwardCUDAKernelLauncher( input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } // 注册算子的cuda实现 void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned); REGISTER_DEVICE_IMPL(roi_align_forward_impl, CUDA, roi_align_forward_cuda); // roi_align.cpp // 使用dispatcher根据参数中的Tensor device类型对实现进行分发 void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output, Tensor argmax_y, Tensor argmax_x, int aligned_height, int aligned_width, float spatial_scale, int sampling_ratio, int pool_mode, bool aligned) { 
DISPATCH_DEVICE_IMPL(roi_align_forward_impl, input, rois, output, argmax_y, argmax_x, aligned_height, aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned); } ``` ### v1.3.11 为了灵活地支持更多的后端和硬件,例如 `NVIDIA GPUs` 、`AMD GPUs`,我们重构了 `mmcv/ops/csrc` 目录。注意,这次重构不会影响 API 的使用。更多相关信息,请参考 [PR1206](https://github.com/open-mmlab/mmcv/pull/1206)。 原始的目录结构如下所示 ``` . ├── common_cuda_helper.hpp ├── ops_cuda_kernel.cuh ├── pytorch_cpp_helper.hpp ├── pytorch_cuda_helper.hpp ├── parrots_cpp_helper.hpp ├── parrots_cuda_helper.hpp ├── parrots_cudawarpfunction.cuh ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h │   ├── ort_mmcv_utils.h │   ├── ... │   ├── onnx_ops.h │   └── cpu │ ├── onnxruntime_register.cpp │      ├── ... │      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_cuda.cu │   ├── ops_parrots.cpp │   └── ops_pytorch.h ├── pytorch │   ├── ... │   ├── ops.cpp │   ├── ops_cuda.cu │   ├── pybind.cpp └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp ├── trt_plugin.hpp ├── trt_serialize.hpp ├── ... ├── trt_ops.hpp └── plugins    ├── trt_cuda_helper.cu    ├── trt_plugin.cpp    ├── ...    ├── trt_ops.cpp    └── trt_ops_kernel.cu ``` 重构之后,它的结构如下所示 ``` . ├── common │ ├── box_iou_rotated_utils.hpp │ ├── parrots_cpp_helper.hpp │ ├── parrots_cuda_helper.hpp │ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cuda_helper.hpp │   └── cuda │   ├── common_cuda_helper.hpp │   ├── parrots_cudawarpfunction.cuh │   ├── ... │   └── ops_cuda_kernel.cuh ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h │   ├── ort_mmcv_utils.h │   ├── ... │   ├── onnx_ops.h │   └── cpu │ ├── onnxruntime_register.cpp │      ├── ... │      └── onnx_ops_impl.cpp ├── parrots │   ├── ... │   ├── ops.cpp │   ├── ops_parrots.cpp │   └── ops_pytorch.h ├── pytorch │   ├── info.cpp │   ├── pybind.cpp │   ├── ... │   ├── ops.cpp │   └── cuda │      ├── ... 
│      └── ops_cuda.cu └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp ├── trt_plugin.hpp ├── trt_serialize.hpp ├── ... ├── trt_ops.hpp └── plugins    ├── trt_cuda_helper.cu    ├── trt_plugin.cpp    ├── ...    ├── trt_ops.cpp    └── trt_ops_kernel.cu ``` ================================================ FILE: docs/zh_cn/conf.py ================================================ # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys import pytorch_sphinx_theme from sphinx.builders.html import StandaloneHTMLBuilder sys.path.insert(0, os.path.abspath('../..')) version_file = '../../mmcv/version.py' with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' copyright = '2018-2022, OpenMMLab' author = 'MMCV Authors' # The short X.Y version version = __version__ # The full version, including alpha/beta/rc tags release = __version__ # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', 'sphinx.ext.autosectionlabel', 'sphinx_markdown_tables', 'myst_parser', 'sphinx_copybutton', ] # yapf: disable myst_heading_anchors = 4 myst_enable_extensions = ['colon_fence'] # Configuration for intersphinx intersphinx_mapping = { 'python': ('https://docs.python.org/3', None), 'numpy': ('https://numpy.org/doc/stable', None), 'torch': ('https://pytorch.org/docs/stable/', None), 'mmengine': ('https://mmengine.readthedocs.io/en/latest', None), } autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = { '.rst': 'restructuredtext', '.md': 'markdown', } # The master toctree document. master_doc = 'index' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = 'zh_CN' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # The name of the Pygments (syntax highlighting) style to use. pygments_style = 'sphinx' # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. 
# # html_theme = 'sphinx_rtd_theme' html_theme = 'pytorch_sphinx_theme' html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # html_theme_options = { 'menu': [ { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, ], # Specify the language of shared menu 'menu_lang': 'cn', } # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] html_css_files = ['css/readthedocs.css'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = 'mmcvdoc' # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ (master_doc, 'mmcv.tex', 'mmcv Documentation', 'MMCV Contributors', 'manual'), ] # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [(master_doc, 'mmcv', 'mmcv Documentation', [author], 1)] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'mmcv', 'mmcv Documentation', author, 'mmcv', 'One line description of project.', 'Miscellaneous'), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] # set priority when building html StandaloneHTMLBuilder.supported_image_types = [ 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg' ] # -- Extension configuration ------------------------------------------------- # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. 
' copybutton_prompt_is_regexp = True ================================================ FILE: docs/zh_cn/docutils.conf ================================================ [html writers] table_style: colwidths-auto ================================================ FILE: docs/zh_cn/faq.md ================================================ ## 常见问题 在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题,并且知道可以帮到大家的解决办法, 欢迎随时丰富这个列表。 ### 安装问题 - KeyError: "xxx: 'yyy is not in the zzz registry'" 只有模块所在的文件被导入时,注册机制才会被触发,所以您需要在某处导入该文件,更多详情请查看 [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974)。 - "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" 1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv 2. 参考 [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) 或者 [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) 安装 mmcv-full - "invalid device function" 或者 "no kernel image is available for execution" 1. 检查 GPU 的 CUDA 计算能力 2. 运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的,您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV。兼容性问题可能会出现在使用旧版的 GPUs,如:colab 上的 Tesla K80 (3.7) 3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如,您可能使用 CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中运行它 - "undefined symbol" 或者 "cannot open xxx.so" 1. 如果符号和 CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致 2. 如果符号和 PyTorch 相关(例如:符号包含 caffe、aten 和 TH),请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致 3. 
运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同 - "RuntimeError: CUDA error: invalid configuration argument" 这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低 [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) 的值并重新编译 mmcv。 - "RuntimeError: nms is not compiled with GPU support" 这个错误是由于您的 CUDA 环境没有正确安装。 您可以尝试重新安装您的 CUDA 环境,然后删除 mmcv/build 文件夹并重新编译 mmcv。 - "Segmentation fault" 1. 检查 GCC 的版本,通常是因为 PyTorch 版本与 GCC 版本不匹配 (例如 GCC \< 4.9 ),我们推荐用户使用 GCC 5.4,我们也不推荐使用 GCC 5.5, 因为有反馈 GCC 5.5 会导致 "segmentation fault" 并且切换到 GCC 5.4 就可以解决问题 2. 检查是否正确安装 CUDA 版本的 PyTorch。输入以下命令并检查是否返回 True ```shell python -c 'import torch; print(torch.cuda.is_available())' ``` 3. 如果 `torch` 安装成功,那么检查 MMCV 是否安装成功。输入以下命令,如果没有报错说明 mmcv-full 安装成功。 ```shell python -c 'import mmcv; import mmcv.ops' ``` 4. 如果 MMCV 与 PyTorch 都安装成功了,则可以使用 `ipdb` 设置断点或者使用 `print` 函数,分析是哪一部分的代码导致了 `segmentation fault` - "libtorch_cuda_cu.so: cannot open shared object file" `mmcv-full` 依赖 `libtorch_cuda_cu.so` 文件,但程序运行时没能找到该文件。我们可以检查该文件是否存在 `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` 也可以尝试重装 PyTorch。 - "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" 如果您在 Windows 上编译 mmcv-full 并且 CUDA 的版本是 9.2,您很可能会遇到这个问题 `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! 
Only the versions 2012, 2013, 2015 and 2017 are supported!"`,您可以尝试使用低版本的 Microsoft Visual Studio,例如 vs2017。 - "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.5.0,您很可能会遇到这个问题 `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`。解决这个问题的方法是将 `torch/csrc/jit/api/module.h` 文件中所有 `static constexpr bool all_slots = false;` 替换为 `static bool all_slots = false;`。更多细节可以查看 [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394)。 - "error: a member with an in-class initializer must be const" 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.6.0,您很可能会遇到这个问题 `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. 解决这个问题的方法是将 `torch/include\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA ` 替换为 `const`。更多细节可以查看 [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575)。 - "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.7.0,您很可能会遇到这个问题 `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. 
解决这个问题的方法是修改 PyTorch 中的几个文件: - 删除 `torch/include\torch/csrc/jit/ir/ir.h` 文件中的 `static constexpr Symbol Kind = ::c10::prim::profile;` 和 `static constexpr Symbol Kind = ::c10::prim::profile_optional;` - 将 `torch\include\pybind11\cast.h` 文件中的 `explicit operator type&() { return *(this->value); }` 替换为 `explicit operator type&() { return *((type*)this->value); }` - 将 `torch/include\torch/csrc/jit/api/module.h` 文件中的 所有 `CONSTEXPR_EXCEPT_WIN_CUDA` 替换为 `const` 更多细节可以查看 [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956)。 - MMCV 和 MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" 请参考 [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 为您的 MMDetection 版本安装正确版本的 MMCV。 ### 使用问题 - "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" 1. 这个错误是因为有些参数没有参与 loss 的计算,可能是代码中存在多个分支,导致有些分支没有参与 loss 的计算。更多细节见 [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582)。 2. 
你可以设置 DDP 中的 `find_unused_parameters` 为 `True`,或者手动查找哪些参数没有用到。 - "RuntimeError: Trying to backward through the graph a second time" 不能同时设置 `GradientCumulativeOptimizerHook` 和 `OptimizerHook`,这会导致 `loss.backward()` 被调用两次,于是程序抛出 `RuntimeError`。我们只需设置其中的一个。更多细节见 [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379)。 ================================================ FILE: docs/zh_cn/get_started/api_reference.md ================================================ # 接口对照表 由于 MMCV v1.x 升级到 MMCV v2.x 时移除了 `mmcv.fileio`,`mmcv.runner`,`mmcv.parallel`,`mmcv.engine`,`mmcv.device` 模块,以及 `mmcv.utils` 中的所有类和大部分函数,分别删除于 PR [#2179](https://github.com/open-mmlab/mmcv/pull/2179),PR [#2216](https://github.com/open-mmlab/mmcv/pull/2216),PR [#2217](https://github.com/open-mmlab/mmcv/pull/2217)。因此我们提供了如下的接口对照表,以便于大家快速查找迁移后的接口。 ## 相关讨论 - [Remove runner, parallel, engine and device](https://github.com/open-mmlab/mmcv/pull/2216) - [ImportError: cannot import name 'is_list_of' from 'mmcv.utils'](https://github.com/open-mmlab/mmcv/issues/2282) - [Could not find the files in MMengine which are removed in MMCV_v2x parallel. 
example, for DataContainer](https://github.com/open-mmlab/mmcv/issues/2934) - [mmcv.cnn.bricks.registry](https://github.com/open-mmlab/mmengine/discussions/1356) - [Replace mmcv's function and modules imported with mmengine's](https://github.com/open-mmlab/mmdetection/pull/8594) ## `mmcv.fileio` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------------------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | | mmcv.fileio.file_client.BaseStorageBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.base.BaseStorageBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/base.py | | mmcv.fileio.file_client.CephBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | | | | mmcv.fileio.file_client.PetrelBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.petrel_backend.PetrelBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/petrel_backend.py | | mmcv.fileio.file_client.MemcachedBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.memcached_backend.MemcachedBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/memcached_backend.py | | mmcv.fileio.file_client.LmdbBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.lmdb_backend.LmdbBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/lmdb_backend.py | | mmcv.fileio.file_client.HardDiskBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.file_client.HardDiskBackend | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py | | mmcv.fileio.file_client.HTTPBackend | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.backends.http_backend.HTTPBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/backends/http_backend.py | | mmcv.fileio.file_client.FileClient | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/file_client.py | mmengine.fileio.file_client.FileClient | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/file_client.py | | mmcv.fileio.io.load | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.io.load | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py | | mmcv.fileio.io.dump | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.io.dump | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/io.py | | mmcv.fileio.io.\_register_handler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.handlers.\_register_handler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py | | mmcv.fileio.io.register_handler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/io.py | mmengine.fileio.handlers.register_handler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/registry_utils.py | | mmcv.fileio.parse.list_from_file | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py | mmengine.fileio.parse.list_from_file | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py | | mmcv.fileio.parse.dict_from_file | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/parse.py | mmengine.fileio.parse.dict_from_file | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/parse.py | | mmcv.fileio.handlers.base.BaseFileHandler | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/base.py | mmengine.fileio.handlers.base.BaseFileHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/base.py | | mmcv.fileio.handlers.json_handler.set_default | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py | mmengine.fileio.handlers.json_handler.set_default | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py | | mmcv.fileio.handlers.json_handler.JsonHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/json_handler.py | mmengine.fileio.handlers.json_handler.JsonHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/json_handler.py | | mmcv.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/pickle_handler.py | mmengine.fileio.handlers.pickle_handler.PickleHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/pickle_handler.py | | mmcv.fileio.handlers.yaml_handler.YamlHandler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/fileio/handlers/yaml_handler.py | mmengine.fileio.handlers.yaml_handler.YamlHandler | https://github.com/open-mmlab/mmengine/blob/main/mmengine/fileio/handlers/yaml_handler.py | ## `mmcv.runner` | MMCV | MMCV URL | MMEngine | MMEngine URL | | --------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | mmcv.runner.hooks.logger.base.LoggerHook | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/base.py | mmengine.hooks.logger_hook.LoggerHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py | | mmcv.runner.hooks.logger.clearml.ClearMLLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/clearml.py | 相似功能:mmengine.visualization.vis_backend.ClearMLVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.dvclive.DvcliveLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/dvclive.py | 相似功能:mmengine.visualization.vis_backend.DVCLiveVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.mlflow.MlflowLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/mlflow.py | 相似功能:mmengine.visualization.vis_backend.MLflowVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.neptune.NeptuneLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/neptune.py | 相似功能:mmengine.visualization.vis_backend.NeptuneVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.pavi.PaviLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/pavi.py | | | | mmcv.runner.hooks.logger.segmind.SegmindLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/segmind.py | | | | mmcv.runner.hooks.logger.tensorboard.TensorboardLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/tensorboard.py | 相似功能:mmengine.visualization.vis_backend.TensorboardVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.logger.text.TextLoggerHook | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/text.py | | | | mmcv.runner.hooks.logger.wandb.WandbLoggerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/logger/wandb.py | 相似功能:mmengine.visualization.vis_backend.WandbVisBackend | https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py | | mmcv.runner.hooks.checkpoint.CheckpointHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/checkpoint.py | mmengine.hooks.checkpoint_hook.CheckpointHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py | | mmcv.runner.hooks.closure.ClosureHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/closure.py | | | | mmcv.runner.hooks.ema.EMAHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/ema.py | mmengine.hooks.ema_hook.EMAHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py | | mmcv.runner.hooks.evaluation.EvalHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py | 部分功能被移至 mmengine.hooks.checkpoint_hook.CheckpointHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py | | mmcv.runner.hooks.evaluation.DistEvalHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/evaluation.py | 部分功能被移至 mmengine.hooks.checkpoint_hook.CheckpointHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py | | mmcv.runner.hooks.hook.HOOKS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py | mmengine.registry.root.HOOKS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.hooks.hook.Hook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/hook.py | mmengine.hooks.hook.Hook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/hook.py | | mmcv.runner.hooks.iter_timer.IterTimerHook | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/iter_timer.py | mmengine.hooks.iter_timer_hook.IterTimerHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py | | mmcv.runner.hooks.lr_updater.LrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.LRSchedulerMixin | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.FixedLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.ConstantLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.StepLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.StepLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.ExpLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.ExponentialLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.PolyLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.PolyLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.InvLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.CosineAnnealingUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.CosineAnnealingLR | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.FlatCosineAnnealingUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.CosineRestartLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.get_position_from_periods | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.CosineRestartLR.get_position_from_periods | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.CyclicLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.OneCycleLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.LinearAnnealingLrUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | | | | mmcv.runner.hooks.lr_updater.annealing_cos | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_cos | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.lr_updater.annealing_linear | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_annealing_linear | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | 
mmcv.runner.hooks.lr_updater.format_param | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/lr_updater.py | mmengine.optim.scheduler.lr_scheduler.OneCycleLR.\_format_param | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py | | mmcv.runner.hooks.memory.EmptyCacheHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/memory.py | mmengine.hooks.empty_cache_hook.EmptyCacheHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py | | mmcv.runner.hooks.momentum_updater.MomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | mmengine.optim.scheduler.momentum_scheduler.MomentumSchedulerMixin | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py | | mmcv.runner.hooks.momentum_updater.StepMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | mmengine.optim.scheduler.momentum_scheduler.StepMomentum | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py | | mmcv.runner.hooks.momentum_updater.CosineAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | mmengine.optim.scheduler.momentum_scheduler.CosineAnnealingMomentum | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/momentum_scheduler.py | | mmcv.runner.hooks.momentum_updater.LinearAnnealingMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | | | | mmcv.runner.hooks.momentum_updater.CyclicMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | | | | mmcv.runner.hooks.momentum_updater.OneCycleMomentumUpdaterHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/momentum_updater.py | | | | 
mmcv.runner.hooks.optimizer.OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | mmengine.optim.optimizer.optimizer_wrapper.OptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py | | mmcv.runner.hooks.optimizer.GradientCumulativeOptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | | | | mmcv.runner.hooks.optimizer.Fp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | 相关功能被移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | | | | mmcv.runner.hooks.optimizer.Fp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | 相关功能被移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.hooks.optimizer.GradientCumulativeFp16OptimizerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/optimizer.py | | | | mmcv.runner.hooks.profiler.ProfilerHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/profiler.py | mmengine.hooks.profiler_hook.ProfilerHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/profiler_hook.py | | mmcv.runner.hooks.sampler_seed.DistSamplerSeedHook |
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sampler_seed.py | mmengine.hooks.sampler_seed_hook.DistSamplerSeedHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py | | mmcv.runner.hooks.sync_buffer.SyncBuffersHook | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/hooks/sync_buffer.py | mmengine.hooks.sync_buffer_hook.SyncBuffersHook | https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py | | mmcv.runner.optimizer.builder.OPTIMIZERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | mmengine.registry.root.OPTIMIZERS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.optimizer.builder.OPTIMIZER_BUILDERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.builder.register_torch_optimizers | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | mmengine.optim.optimizer.builder.register_torch_optimizers | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/builder.py | | mmcv.runner.optimizer.builder.TORCH_OPTIMIZERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.builder.build_optimizer_constructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.builder.build_optimizer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/builder.py | | | | mmcv.runner.optimizer.default_constructor.DefaultOptimizerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/optimizer/default_constructor.py | | | | mmcv.runner.base_module.BaseModule | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.BaseModule | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | |
mmcv.runner.base_module.Sequential | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.Sequential | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_module.ModuleList | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.ModuleList | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_module.ModuleDict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_module.py | mmengine.model.base_module.ModuleDict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_module.py | | mmcv.runner.base_runner.BaseRunner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/base_runner.py | mmengine.runner.runner.Runner | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py | | mmcv.runner.builder.RUNNERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | mmengine.registry.root.RUNNERS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.builder.RUNNER_BUILDERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | mmengine.registry.root.RUNNER_CONSTRUCTORS | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.runner.builder.build_runner_constructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | | | | mmcv.runner.builder.build_runner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/builder.py | | | | mmcv.runner.checkpoint.ENV_MMCV_HOME | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.ENV_MMENGINE_HOME | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.ENV_XDG_CACHE_HOME | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | 
mmengine.runner.checkpoint.ENV_XDG_CACHE_HOME | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.DEFAULT_CACHE_HOME | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.DEFAULT_CACHE_DIR | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_get_mmcv_home | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_get_mmengine_home | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_state_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_state_dict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_torchvision_models | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_torchvision_models | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_external_models | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_external_models | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_mmcls_models | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_mmcls_models | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_deprecated_model_names | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_deprecated_model_names | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_process_mmcls_checkpoint | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_process_mmcls_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.CheckpointLoader | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.CheckpointLoader | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_local | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_local | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_http | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_http | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_pavi | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_pavi | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_ceph | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_ceph | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_torchvision | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_torchvision | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_openmmlab | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_openmmlab | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_from_mmcls | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_from_mmcls | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_load_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_load_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_load_checkpoint_with_prefix | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_load_checkpoint_with_prefix | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.load_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.load_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.weights_to_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.weights_to_cpu | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.\_save_to_state_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.\_save_to_state_dict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.get_state_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.get_state_dict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | mmcv.runner.checkpoint.save_checkpoint | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/checkpoint.py | mmengine.runner.checkpoint.save_checkpoint | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/checkpoint.py | | 
mmcv.runner.default_constructor.DefaultRunnerConstructor | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/default_constructor.py | | | | mmcv.runner.dist_utils.\_find_free_port | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.dist_utils.\_is_free_port | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.dist_utils.init_dist | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.init_dist | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.\_init_dist_pytorch | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.\_init_dist_pytorch | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.\_init_dist_mpi | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.\_init_dist_mpi | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.\_init_dist_slurm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.\_init_dist_slurm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.get_dist_info | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.get_dist_info | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.master_only | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | mmengine.dist.utils.master_only | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dist/utils.py | | mmcv.runner.dist_utils.allreduce_params | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.dist_utils.allreduce_grads | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py
| | | | mmcv.runner.dist_utils.\_allreduce_coalesced | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/dist_utils.py | | | | mmcv.runner.epoch_based_runner.EpochBasedRunner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py | mmengine.runner.loops.EpochBasedTrainLoop | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py | | mmcv.runner.epoch_based_runner.Runner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/epoch_based_runner.py | | | | mmcv.runner.fp16_utils.cast_tensor_type | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.auto_fp16 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.force_fp32 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.allreduce_grads | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至
mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.wrap_fp16_model | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.patch_norm_fp32 | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.patch_forward_method | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.fp16_utils.LossScaler | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/fp16_utils.py | fp16 相关移至 mmengine.optim.optimizer.amp_optimizer_wrapper.AmpOptimWrapper 以及 mmengine.optim.optimizer.apex_optimizer_wrapper.ApexOptimWrapper |
https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/apex_optimizer_wrapper.py | | mmcv.runner.iter_based_runner.IterLoader | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py | | | | mmcv.runner.iter_based_runner.IterBasedRunner | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/iter_based_runner.py | mmengine.runner.loops.IterBasedTrainLoop | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py | | mmcv.runner.log_buffer.LogBuffer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/log_buffer.py | | | | mmcv.runner.priority.Priority | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py | mmengine.runner.priority.Priority | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py | | mmcv.runner.priority.get_priority | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/priority.py | mmengine.runner.priority.get_priority | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/priority.py | | mmcv.runner.utils.get_host_info | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | | | | mmcv.runner.utils.get_time_str | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | | | | mmcv.runner.utils.obj_from_dict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | | | | mmcv.runner.utils.set_random_seed | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/runner/utils.py | mmengine.runner.utils.set_random_seed | https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/utils.py | ## `mmcv.parallel` | MMCV | MMCV URL | MMEngine | MMEngine URL | | -------------------------------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------------------- |
----------------------------------------------------------------------------------------- | | mmcv.parallel.\_functions.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.\_functions.synchronize_stream | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.\_functions.get_input_device | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.\_functions.Scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/_functions.py | | | | mmcv.parallel.collate.collate | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/collate.py | | | | mmcv.parallel.data_container.assert_tensor_type | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | | | | mmcv.parallel.data_container.DataContainer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_container.py | 相似功能:mmengine.structures.base_data_element.BaseDataElement | https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/base_data_element.py | | mmcv.parallel.data_parallel.MMDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/data_parallel.py | | | | mmcv.parallel.distributed.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed.py | mmengine.model.wrappers.distributed.MMDistributedDataParallel | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py | | mmcv.parallel.distributed_deprecated.MMDistributedDataParallel | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/distributed_deprecated.py | mmengine.model.wrappers.distributed.MMDistributedDataParallel | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/distributed.py | | mmcv.parallel.registry.MODULE_WRAPPERS | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/registry.py | mmengine.registry.root.MODEL_WRAPPERS |
https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/root.py | | mmcv.parallel.scatter_gather.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py | | | | mmcv.parallel.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/scatter_gather.py | | | | mmcv.parallel.utils.is_module_wrapper | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/parallel/utils.py | mmengine.model.wrappers.utils.is_model_wrapper | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/wrappers/utils.py | ## `mmcv.engine` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------ | ------------------------------------------------------------------ | -------- | ------------ | | mmcv.engine.test.single_gpu_test | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | | mmcv.engine.test.multi_gpu_test | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | | mmcv.engine.test.collect_results_cpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | | mmcv.engine.test.collect_results_gpu | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/engine/test.py | | | ## `mmcv.device` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ----------------------------------------- | ---------------------------------------------------------------------------- | -------------------------------- | ------------------------------------------------------------------------- | | mmcv.device.ipu | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/ipu | | | | mmcv.device.mlu | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mlu | | | | mmcv.device.mps | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/mps | | | | mmcv.device.npu | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/device/npu | | | | mmcv.device.\_functions.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py | | | 
| mmcv.device.\_functions.Scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/_functions.py | | | | mmcv.device.scatter_gather.scatter | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py | | | | mmcv.device.scatter_gather.scatter_kwargs | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/scatter_gather.py | | | | mmcv.device.utils.get_device | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/device/utils.py | mmengine.device.utils.get_device | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | ## `mmcv.utils` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------------------------ | ---------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | | mmcv.utils.config.BASE_KEY | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.BASE_KEY | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.DELETE_KEY | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.DELETE_KEY | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.DEPRECATION_KEY | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.DEPRECATION_KEY | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.ConfigDict | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.ConfigDict | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.add_args | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.add_args | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.Config | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.Config | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.config.DictAction | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/config.py | mmengine.config.config.DictAction | https://github.com/open-mmlab/mmengine/blob/main/mmengine/config/config.py | | mmcv.utils.device_type.is_ipu_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | | | | mmcv.utils.device_type.IS_IPU_AVAILABLE | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | | | | mmcv.utils.device_type.is_mlu_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | mmengine.device.utils.is_mlu_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.device_type.is_mps_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | mmengine.device.utils.is_mps_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.device_type.is_npu_available | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/device_type.py | mmengine.device.utils.is_npu_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.hub.\_is_legacy_zip_format | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py | mmengine.utils.dl_utils.hub.\_is_legacy_zip_format | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py | | mmcv.utils.hub.\_legacy_zip_load | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py | mmengine.utils.dl_utils.hub.\_legacy_zip_load | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py | | mmcv.utils.hub.load_url | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/hub.py | mmengine.utils.dl_utils.hub.load_url | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/hub.py | | mmcv.utils.logging.logger_initialized | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py | | | | mmcv.utils.logging.get_logger | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py | | | | mmcv.utils.logging.print_log | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/logging.py | | | | mmcv.utils.misc.\_ntuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.\_ntuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_1tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_1tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_2tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_2tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_3tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_3tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_4tuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_4tuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.to_ntuple | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.to_ntuple | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_str | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_str | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.import_modules_from_strings | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.import_modules_from_strings | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.iter_cast | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.iter_cast | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.list_cast | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.list_cast | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.tuple_cast | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.tuple_cast | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_seq_of | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_seq_of | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_list_of | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_list_of | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_tuple_of | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_tuple_of | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.slice_list | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.slice_list | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.concat_list | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.concat_list | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.check_prerequisites | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.check_prerequisites | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.\_check_py_package | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.\_check_py_package | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.\_check_executable | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.\_check_executable | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.requires_package | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.requires_package | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.requires_executable | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.requires_executable | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.deprecated_api_warning | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.deprecated_api_warning | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.is_method_overridden | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.is_method_overridden | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.misc.has_method | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/misc.py | mmengine.utils.misc.has_method | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/misc.py | | mmcv.utils.parrots_wrapper.TORCH_VERSION | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.TORCH_VERSION | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.is_cuda_available | 
https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.devices.utils.is_cuda_available | https://github.com/open-mmlab/mmengine/blob/main/mmengine/device/utils.py | | mmcv.utils.parrots_wrapper.IS_CUDA_AVAILABLE | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | | | | mmcv.utils.parrots_wrapper.is_rocm_pytorch | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.is_rocm_pytorch | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_cuda_home | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_cuda_home | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.get_build_config | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.get_build_config | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_conv | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_conv | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_dataloader | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_dataloader | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_extension | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_extension | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | 
mmcv.utils.parrots_wrapper.\_get_pool | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_pool | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.\_get_norm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.\_get_norm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.parrots_wrapper.SyncBatchNorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/parrots_wrapper.py | mmengine.utils.dl_utils.parrots_wrapper.SyncBatchNorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/parrots_wrapper.py | | mmcv.utils.path.is_filepath | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.is_filepath | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.fopen | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.fopen | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.check_file_exist | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.check_file_exist | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.mkdir_or_exist | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.mkdir_or_exist | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.symlink | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.symlink | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.scandir | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.scandir | 
https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.path.find_vcs_root | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/path.py | mmengine.utils.path.find_vcs_root | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/path.py | | mmcv.utils.progressbar.ProgressBar | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.ProgressBar | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.track_progress | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.track_progress | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.init_pool | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.init_pool | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.track_parallel_progress | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.track_parallel_progress | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.progressbar.track_iter_progress | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/progressbar.py | mmengine.utils.progressbar.track_iter_progress | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/progressbar.py | | mmcv.utils.registry.build_from_cfg | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py | mmengine.registry.build_functions.build_from_cfg | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/build_functions.py | | mmcv.utils.registry.Registry | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/registry.py | mmengine.registry.registry.Registry | https://github.com/open-mmlab/mmengine/blob/main/mmengine/registry/registry.py | | 
mmcv.utils.seed.worker_init_fn | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/seed.py | mmengine.dataset.utils.worker_init_fn | https://github.com/open-mmlab/mmengine/blob/main/mmengine/dataset/utils.py | | mmcv.utils.testing.check_python_script | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.check_python_script | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.\_any | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.\_any | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_dict_contains_subset | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_dict_contains_subset | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_attrs_equal | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_attrs_equal | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_dict_has_keys | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_dict_has_keys | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_keys_equal | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_keys_equal | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_is_norm_layer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py | mmengine.testing.compare.assert_is_norm_layer | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.testing.assert_params_all_zeros | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/testing.py 
| mmengine.testing.compare.assert_params_all_zeros | https://github.com/open-mmlab/mmengine/blob/main/mmengine/testing/compare.py | | mmcv.utils.timer.TimerError | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.TimerError | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.timer.Timer | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.Timer | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.timer.\_g_timers | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.\_g_timers | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.timer.check_time | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/timer.py | mmengine.utils.timer.check_time | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/timer.py | | mmcv.utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py | mmengine.utils.dl_utils.torch_ops.\_torch_version_meshgrid_indexing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py | | mmcv.utils.torch_ops.torch_meshgrid | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/torch_ops.py | mmengine.utils.dl_utils.torch_ops.torch_meshgrid | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/torch_ops.py | | mmcv.utils.trace.is_jit_tracing | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/trace.py | mmengine.utils.dl_utils.trace.is_jit_tracing | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/dl_utils/trace.py | | mmcv.utils.version_utils.digit_version | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py | mmengine.utils.version_utils.digit_version | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py | | 
mmcv.utils.version_utils.\_minimal_ext_cmd | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py | mmengine.utils.version_utils.\_minimal_ext_cmd | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py | | mmcv.utils.version_utils.get_git_hash | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/utils/version_utils.py | mmengine.utils.version_utils.get_git_hash | https://github.com/open-mmlab/mmengine/blob/main/mmengine/utils/version_utils.py | ## `mmcv.cnn` | MMCV | MMCV URL | MMEngine | MMEngine URL | | -------------------------------------------- | ------------------------------------------------------------------------ | ------------------------------------------ | ------------------------------------------------------------------------ | | mmcv.cnn.utils.sync_bn.\_BatchNormXd | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.\_BatchNormXd | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py | | mmcv.cnn.utils.sync_bn.revert_sync_batchnorm | https://github.com/open-mmlab/mmcv/blob/v1.7.1/mmcv/cnn/utils/sync_bn.py | mmengine.model.utils.revert_sync_batchnorm | https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/utils.py | ## `mmcv.model_zoo` | MMCV | MMCV URL | MMEngine | MMEngine URL | | ------------------------------------ | ----------------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------- | | mmcv.model_zoo.deprecated.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/deprecated.json | mmengine.hub.deprecated.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/deprecated.json | | mmcv.model_zoo.mmcls.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/mmcls.json | mmengine.hub.mmcls.json | 
https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/mmcls.json | | mmcv.model_zoo.open_mmlab.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/open_mmlab.json | mmengine.hub.openmmlab.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/openmmlab.json | | mmcv.model_zoo.torchvision_0.12.json | https://github.com/open-mmlab/mmcv/tree/v1.7.1/mmcv/model_zoo/torchvision_0.12.json | mmengine.hub.torchvision_0.12.json | https://github.com/open-mmlab/mmengine/tree/main/mmengine/hub/torchvision_0.12.json | ================================================ FILE: docs/zh_cn/get_started/article.md ================================================ ## 解读文章汇总 这篇文章汇总了 [OpenMMLab](https://www.zhihu.com/people/openmmlab) 解读的部分文章(更多文章和视频见 [OpenMMLabCourse](https://github.com/open-mmlab/OpenMMLabCourse)),如果您有推荐的文章(不一定是 OpenMMLab 发布的文章,可以是自己写的文章),非常欢迎提 [Pull Request](../community/pr.md) 添加到这里。 ### MMCV 解读文章 #### 框架解读 - [MMCV 核心组件分析(一):整体概述](https://zhuanlan.zhihu.com/p/336081587) - [MMCV 核心组件分析(二):FileHandler](https://zhuanlan.zhihu.com/p/336097883) - [MMCV 核心组件分析(三): FileClient](https://zhuanlan.zhihu.com/p/339190576) - [MMCV 核心组件分析(四): Config](https://zhuanlan.zhihu.com/p/346203167) - [MMCV 核心组件分析(五): Registry](https://zhuanlan.zhihu.com/p/355271993) - [MMCV 核心组件分析(六): Hook](https://zhuanlan.zhihu.com/p/355272220) - [MMCV 核心组件分析(七): Runner](https://zhuanlan.zhihu.com/p/355272459) - [MMCV Hook 食用指南](https://zhuanlan.zhihu.com/p/448600739) - [PyTorch & MMCV Dispatcher 机制解析](https://zhuanlan.zhihu.com/p/451671838) #### 工具解读 - [训练可视化工具哪款是你的菜?MMCV一行代码随你挑](https://zhuanlan.zhihu.com/p/387078211) #### 安装指南 - [久等了!Windows 平台 MMCV 的预编译包终于来了!](https://zhuanlan.zhihu.com/p/441653536) - [Windows 环境从零安装 mmcv-full](https://zhuanlan.zhihu.com/p/434491590) #### 知乎问答 - [深度学习科研,如何高效进行代码和实验管理?](https://www.zhihu.com/question/269707221/answer/2480772257) - 
[深度学习方面的科研工作中的实验代码有什么规范和写作技巧?如何妥善管理实验数据?](https://www.zhihu.com/question/268193800/answer/2586000037) ### 下游算法库解读文章 - [MMDetection](https://mmdetection.readthedocs.io/zh_CN/latest/article.html) ### PyTorch 解读文章 - [PyTorch1.11 亮点一览:TorchData、functorch、DDP 静态图](https://zhuanlan.zhihu.com/p/486222256) - [PyTorch1.12 亮点一览:DataPipe + TorchArrow 新的数据加载与处理范式](https://zhuanlan.zhihu.com/p/537868554) - [PyTorch 源码解读之 nn.Module:核心网络模块接口详解](https://zhuanlan.zhihu.com/p/340453841) - [PyTorch 源码解读之 torch.autograd:梯度计算详解](https://zhuanlan.zhihu.com/p/321449610) - [PyTorch 源码解读之 torch.utils.data:解析数据处理全流程](https://zhuanlan.zhihu.com/p/337850513) - [PyTorch 源码解读之 torch.optim:优化算法接口详解](https://zhuanlan.zhihu.com/p/346205754) - [PyTorch 源码解读之 DP & DDP:模型并行和分布式训练解析](https://zhuanlan.zhihu.com/p/343951042) - [PyTorch 源码解读之 BN & SyncBN:BN 与 多卡同步 BN 详解](https://zhuanlan.zhihu.com/p/337732517) - [PyTorch 源码解读之 torch.cuda.amp: 自动混合精度详解](https://zhuanlan.zhihu.com/p/348554267) - [PyTorch 源码解读之 cpp_extension:揭秘 C++/CUDA 算子实现和调用全流程](https://zhuanlan.zhihu.com/p/348555597) - [PyTorch 源码解读之即时编译篇](https://zhuanlan.zhihu.com/p/361101354) - [PyTorch 源码解读之分布式训练了解一下?](https://zhuanlan.zhihu.com/p/361314953) - [PyTorch 源码解读之 torch.serialization & torch.hub](https://zhuanlan.zhihu.com/p/364239544) ### 其他 - [困扰我 48 小时的深拷贝,今天终于...](https://zhuanlan.zhihu.com/p/470892209) - [拿什么拯救我的 4G 显卡](https://zhuanlan.zhihu.com/p/430123077) - [是谁偷偷动了我的 logger](https://zhuanlan.zhihu.com/p/481383590) - [三句话,让 logger 言听计从](https://zhuanlan.zhihu.com/p/487524917) - [Logging 不为人知的二三事](https://zhuanlan.zhihu.com/p/502610682) - [Type Hints 入门教程,让代码更加规范整洁](https://zhuanlan.zhihu.com/p/519335398) - [手把手教你如何高效地在 MMCV 中贡献算子](https://zhuanlan.zhihu.com/p/464492627) - [OpenMMLab 支持 IPU 训练芯片](https://zhuanlan.zhihu.com/p/517527926) - [基于 MMCV 走上开源大佬之路?](https://zhuanlan.zhihu.com/p/391144979) ================================================ FILE: docs/zh_cn/get_started/build.md ================================================ 
## 从源码编译 MMCV ### 编译 mmcv 在编译 mmcv 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。可使用以下命令验证 ```bash python -c 'import torch;print(torch.__version__)' ``` :::{note} - 如果克隆代码仓库的速度过慢,可以使用以下命令克隆(注意:gitee 的 mmcv 不一定和 github 的保持一致,因为每天只同步一次) ```bash git clone https://gitee.com/open-mmlab/mmcv.git ``` - 如果打算使用 `opencv-python-headless` 而不是 `opencv-python`,例如在一个很小的容器环境或者没有图形用户界面的服务器中,你可以先安装 `opencv-python-headless`,这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。 - 如果编译过程安装依赖库的时间过长,可以[设置 pypi 源](https://mirrors.tuna.tsinghua.edu.cn/help/pypi/) ```bash pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple ``` ::: #### 在 Linux 上编译 mmcv | TODO: 视频教程 1. 克隆代码仓库 ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` 2. 安装 `ninja` 和 `psutil` 以加快编译速度 ```bash pip install -r requirements/optional.txt ``` 3. 检查 nvcc 的版本(要求大于等于 9.2,如果没有 GPU,可以跳过) ```bash nvcc --version ``` 上述命令如果输出以下信息,表示 nvcc 的设置没有问题,否则需要设置 CUDA_HOME ``` nvcc: NVIDIA (R) Cuda compiler driver Copyright (c) 2005-2020 NVIDIA Corporation Built on Mon_Nov_30_19:08:53_PST_2020 Cuda compilation tools, release 11.2, V11.2.67 Build cuda_11.2.r11.2/compiler.29373293_0 ``` :::{note} 如果想要支持 ROCm,可以参考 [AMD ROCm](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html) 安装 ROCm。 ::: 4. 检查 gcc 的版本(要求大于等于**5.4**) ```bash gcc --version ``` 5. 开始编译(预估耗时 10 分钟) ```bash pip install -e . -v ``` 6. 验证安装 ```bash python .dev_scripts/check_installation.py ``` 如果上述命令没有报错,说明安装成功。如有报错,请查看[问题解决页面](../faq.html)是否已经有解决方案。 如果没有找到解决方案,欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。 #### 在 macOS 上编译 mmcv | TODO: 视频教程 ```{note} 如果你使用的是搭载 apple silicon 的 mac 设备,请安装 PyTorch 1.13+ 的版本,否则会遇到 [issues#2218](https://github.com/open-mmlab/mmcv/issues/2218) 中的问题。 ``` 1. 克隆代码仓库 ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` 2. 安装 `ninja` 和 `psutil` 以加快编译速度 ```bash pip install -r requirements/optional.txt ``` 3. 
开始编译 ```bash pip install -e . ``` 4. 验证安装 ```bash python .dev_scripts/check_installation.py ``` 如果上述命令没有报错,说明安装成功。如有报错,请查看[问题解决页面](../faq.md)是否已经有解决方案。 如果没有找到解决方案,欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。 #### 在 Windows 上编译 mmcv | TODO: 视频教程 在 Windows 上编译 mmcv 比 Linux 复杂,本节将一步步介绍如何在 Windows 上编译 mmcv。 ##### 依赖项 请先安装以下的依赖项: - [Git](https://git-scm.com/download/win):安装期间,请选择 **add git to Path** - [Visual Studio Community 2019](https://visualstudio.microsoft.com):用于编译 C++ 和 CUDA 代码 - [Miniconda](https://docs.conda.io/en/latest/miniconda.html):包管理工具 - [CUDA 10.2](https://developer.nvidia.com/cuda-10.2-download-archive):如果只需要 CPU 版本可以不安装 CUDA,安装 CUDA 时,可根据需要进行自定义安装。如果已经安装新版本的显卡驱动,建议取消驱动程序的安装 ```{note} 如果不清楚如何安装以上依赖,请参考[Windows 环境从零安装 mmcv](https://zhuanlan.zhihu.com/p/434491590)。 另外,你需要知道如何在 Windows 上设置环境变量,尤其是 "PATH" 的设置,以下安装过程都会用到。 ``` ##### 通用步骤 1. 从 Windows 菜单启动 Anaconda 命令行 如 Miniconda 安装程序建议,不要使用原始的 `cmd.exe` 或是 `powershell.exe`。命令行有两个版本,一个基于 PowerShell,一个基于传统的 `cmd.exe`。请注意以下说明都是使用的基于 PowerShell 2. 创建一个新的 Conda 环境 ```powershell (base) PS C:\Users\xxx> conda create --name mmcv python=3.7 (base) PS C:\Users\xxx> conda activate mmcv # 确保做任何操作前先激活环境 ``` 3. 安装 PyTorch 时,可以根据需要安装支持 CUDA 或不支持 CUDA 的版本 ```powershell # CUDA version (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cudatoolkit=10.2 -c pytorch # CPU version (mmcv) PS C:\Users\xxx> conda install pytorch torchvision cpuonly -c pytorch ``` 4. 克隆代码仓库 ```powershell (mmcv) PS C:\Users\xxx> git clone https://github.com/open-mmlab/mmcv.git (mmcv) PS C:\Users\xxx> cd mmcv ``` 5. 安装 `ninja` 和 `psutil` 以加快编译速度 ```powershell (mmcv) PS C:\Users\xxx\mmcv> pip install -r requirements/optional.txt ``` 6. 
设置 MSVC 编译器 设置环境变量。添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` 可以在命令行中运行,如下所示。 ```powershell (mmcv) PS C:\Users\xxx\mmcv> cl Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 Copyright (C) Microsoft Corporation. All rights reserved. usage: cl [ option... ] filename... [ / link linkoption... ] ``` 为了兼容性,我们使用 x86-hosted 以及 x64-targeted 版本,即路径中的 `Hostx86\x64` 。 因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本,只有 utf-8 将会被识别,你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。 ##### 编译与安装 mmcv mmcv 有两个版本: - 只包含 CPU 算子的版本 编译 CPU 算子,但只有 x86 将会被编译,并且编译版本只能在 CPU only 情况下运行 - 既包含 CPU 算子,又包含 CUDA 算子的版本 同时编译 CPU 和 CUDA 算子,`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU ###### CPU 版本 编译安装 ```powershell (mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 (mmcv) PS C:\Users\xxx\mmcv> python setup.py develop # 安装 ``` ###### GPU 版本 1. 检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量已经存在在 `envs` 之中 ```powershell (mmcv) PS C:\Users\xxx\mmcv> ls env: Name Value ---- ----- CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 ``` 如果没有,你可以按照下面的步骤设置 ```powershell (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" # 或者 (mmcv) PS C:\Users\xxx\mmcv> $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 已经在环境变量中 ``` 2. 
设置 CUDA 的目标架构 ```powershell # 这里需要改成你的显卡对应的目标架构 (mmcv) PS C:\Users\xxx\mmcv> $env:TORCH_CUDA_ARCH_LIST="7.5" ``` :::{note} 可以点击 [cuda-gpus](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力,也可以通过 CUDA 目录下的 deviceQuery.exe 工具查看 ```powershell (mmcv) PS C:\Users\xxx\mmcv> &"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\extras\demo_suite\deviceQuery.exe" Device 0: "NVIDIA GeForce GTX 1660 SUPER" CUDA Driver Version / Runtime Version 11.7 / 11.1 CUDA Capability Major/Minor version number: 7.5 ``` 上面的 7.5 表示目标架构。注意:需把上面命令的 v10.2 换成你的 CUDA 版本。 ::: 3. 编译安装 ```powershell (mmcv) PS C:\Users\xxx\mmcv> python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 (mmcv) PS C:\Users\xxx\mmcv> python setup.py develop # 安装 ``` ```{note} 如果你的 PyTorch 版本是 1.6.0,你可能会遇到一些 [issue](https://github.com/pytorch/pytorch/issues/42467) 提到的错误,你可以参考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改本地环境的 PyTorch 源代码 ``` ##### 验证安装 ```powershell (mmcv) PS C:\Users\xxx\mmcv> python .dev_scripts/check_installation.py ``` 如果上述命令没有报错,说明安装成功。如有报错,请查看[问题解决页面](../faq.md)是否已经有解决方案。 如果没有找到解决方案,欢迎提 [issue](https://github.com/open-mmlab/mmcv/issues)。 ### 编译 mmcv-lite 如果你需要使用和 PyTorch 相关的模块,请确保 PyTorch 已经成功安装在环境中,可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。 1. 克隆代码仓库 ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` 2. 开始编译 ```bash MMCV_WITH_OPS=0 pip install -e . -v ``` 3. 
验证安装 ```bash python -c 'import mmcv;print(mmcv.__version__)' ``` ### 在寒武纪 MLU 机器编译 mmcv-full #### 安装 torch_mlu ##### 选项1: 基于寒武纪 docker image 安装 首先请下载并且拉取寒武纪 docker (请向 service@cambricon.com 发邮件以获得最新的寒武纪 pytorch 发布 docker)。 ``` docker pull ${docker image} ``` 进入 docker, [编译 MMCV MLU](#编译-mmcv) 并[进行验证](#验证是否成功安装)。 ##### 选项2:基于 cambricon pytorch 源码编译安装 请向 service@cambricon.com 发送邮件或联系 Cambricon 工程师以获取合适版本的 CATCH 软件包,在您获得合适版本的 CATCH 软件包后,请参照 ${CATCH-path}/CONTRIBUTING.md 中的步骤安装 CATCH。 #### 编译 MMCV 克隆代码仓库 ```bash git clone https://github.com/open-mmlab/mmcv.git ``` 算子库 mlu-ops 在编译 MMCV 时自动下载到默认路径(mmcv/mlu-ops),你也可以在编译前设置环境变量 MMCV_MLU_OPS_PATH 指向已经存在的 mlu-ops 算子库路径。 ```bash export MMCV_MLU_OPS_PATH=/xxx/xxx/mlu-ops ``` 开始编译 ```bash cd mmcv export MMCV_WITH_OPS=1 export FORCE_MLU=1 python setup.py install ``` #### 验证是否成功安装 完成上述安装步骤之后,您可以尝试运行下面的 Python 代码以测试您是否成功在 MLU 设备上安装了 mmcv-full ```python import torch import torch_mlu from mmcv.ops import sigmoid_focal_loss x = torch.randn(3, 10).mlu() x.requires_grad = True y = torch.tensor([1, 5, 3]).mlu() w = torch.ones(10).float().mlu() output = sigmoid_focal_loss(x, y, 2.0, 0.25, w, 'none') ``` ### 在昇腾 NPU 机器编译 mmcv 在编译 mmcv 前,需要安装 torch_npu,完整安装教程详见 [PyTorch 安装指南](https://gitee.com/ascend/pytorch/blob/master/docs/zh/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97/PyTorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97.md#pytorch%E5%AE%89%E8%A3%85%E6%8C%87%E5%8D%97) #### 选项 1: 使用 NPU 设备源码编译安装 mmcv (推荐方式) - 拉取 [MMCV 源码](https://github.com/open-mmlab/mmcv.git) ```bash git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` - 编译 ```bash MMCV_WITH_OPS=1 MAX_JOBS=8 FORCE_NPU=1 python setup.py build_ext ``` - 安装 ```bash MMCV_WITH_OPS=1 FORCE_NPU=1 python setup.py develop ``` #### 选项 2: 使用 pip 安装 Ascend 编译版本的 mmcv Ascend 编译版本的 mmcv 在 mmcv >= 1.7.0 时已经支持直接 pip 安装 ```bash pip install mmcv -f https://download.openmmlab.com/mmcv/dist/ascend/torch1.8.0/index.html ``` #### 验证 ```python import torch import torch_npu from mmcv.ops import softmax_focal_loss # 
Init tensor to the NPU x = torch.randn(3, 10).npu() y = torch.tensor([1, 5, 3]).npu() w = torch.ones(10).float().npu() output = softmax_focal_loss(x, y, 2.0, 0.25, w, 'none') print(output) ``` ================================================ FILE: docs/zh_cn/get_started/installation.md ================================================ ## 安装 MMCV MMCV 有两个版本: - **mmcv**: 完整版,包含所有的特性以及丰富的开箱即用的 CPU 和 CUDA 算子。注意,完整版本可能需要更长时间来编译。 - **mmcv-lite**: 精简版,不包含 CPU 和 CUDA 算子但包含其余所有特性和功能,类似 MMCV 1.0 之前的版本。如果你不需要使用算子的话,精简版可以作为一个考虑选项。 ```{warning} 请不要在同一个环境中安装两个版本,否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前,需要先卸载另一个。`如果 CUDA 可用,强烈推荐安装 mmcv`。 ``` ### 安装 mmcv 在安装 mmcv 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。可使用以下命令验证 ```bash python -c 'import torch;print(torch.__version__)' ``` 如果输出版本信息,则表示 PyTorch 已安装。 #### 使用 mim 安装(推荐) [mim](https://github.com/open-mmlab/mim) 是 OpenMMLab 项目的包管理工具,使用它可以很方便地安装 mmcv。 ```bash pip install -U openmim mim install mmcv ``` 如果发现上述的安装命令没有使用预编译包(以 `.whl` 结尾)而是使用源码包(以 `.tar.gz` 结尾)安装,则有可能是我们没有提供和当前环境的 PyTorch 版本、CUDA 版本相匹配的 mmcv 预编译包,此时,你可以[源码安装 mmcv](build.md)。
使用预编译包的安装日志 Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
Collecting mmcv
Downloading https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/mmcv-2.0.0-cp38-cp38-manylinux1_x86_64.whl
使用源码包的安装日志 Looking in links: https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
Collecting mmcv==2.0.0
Downloading mmcv-2.0.0.tar.gz
如需安装指定版本的 mmcv,例如安装 2.0.0 版本的 mmcv,可使用以下命令 ```bash mim install mmcv==2.0.0 ``` :::{note} 如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`,例如在一个很小的容器环境或者没有图形用户界面的服务器中,你可以先安装 `opencv-python-headless`,这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。 另外,如果安装依赖库的时间过长,可以指定 pypi 源 ```bash mim install "mmcv>=2.0.0rc1" -i https://pypi.tuna.tsinghua.edu.cn/simple ``` ::: 安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) 脚本检查 mmcv 是否安装成功。 #### 使用 pip 安装 使用以下命令查看 CUDA 和 PyTorch 的版本 ```bash python -c 'import torch;print(torch.__version__);print(torch.version.cuda)' ``` 根据系统的类型、CUDA 版本、PyTorch 版本以及 MMCV 版本选择相应的安装命令





如果在上面的下拉框中没有找到对应的版本,则可能是没有对应 PyTorch 或者 CUDA 或者 mmcv 版本的预编译包,此时,你可以[源码安装 mmcv](build.md)。

:::{note}
PyTorch 在 1.x.0 和 1.x.1 之间通常是兼容的,故 mmcv 只提供 1.x.0 的编译包。如果你
的 PyTorch 版本是 1.x.1,你可以放心地安装在 1.x.0 版本编译的 mmcv。例如,如果你的
PyTorch 版本是 1.8.1,你可以放心选择 1.8.x。
:::

:::{note}
如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`,例如在一个很小的容器环境或者没有图形用户界面的服务器中,你可以先安装 `opencv-python-headless`,这样在安装 mmcv 依赖的过程中会跳过 `opencv-python`。

另外,如果安装依赖库的时间过长,可以指定 pypi 源

```bash
pip install mmcv -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -i https://pypi.tuna.tsinghua.edu.cn/simple
```

:::

安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/blob/main/.dev_scripts/check_installation.py) 脚本检查 mmcv 是否安装成功。

#### 使用 docker 镜像

先将算法库克隆到本地再构建镜像

```bash
git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
docker build -t mmcv -f docker/release/Dockerfile .
```

也可以直接使用下面的命令构建镜像

```bash
docker build -t mmcv https://github.com/open-mmlab/mmcv.git#main:docker/release
```

[Dockerfile](https://github.com/open-mmlab/mmcv/blob/main/docker/release/Dockerfile) 默认安装最新的 mmcv,如果你想要指定版本,可以使用下面的命令

```bash
docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=2.0.0 .
```

如果你想要使用其他版本的 PyTorch 和 CUDA,你可以在构建镜像时指定它们的版本。

例如指定 PyTorch 的版本是 1.11,CUDA 的版本是 11.3

```bash
docker build -t mmcv -f docker/release/Dockerfile \
    --build-arg PYTORCH=1.11.0 \
    --build-arg CUDA=11.3 \
    --build-arg CUDNN=8 \
    --build-arg MMCV=2.0.0 .
```

更多 PyTorch 和 CUDA 镜像可以点击 [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags) 查看。

### 安装 mmcv-lite

如果你需要使用和 PyTorch 相关的模块,请确保 PyTorch 已经成功安装在环境中,可以参考 [PyTorch 官方安装文档](https://pytorch.org/get-started/locally/#start-locally)。

```bash
pip install mmcv-lite
```


================================================
FILE: docs/zh_cn/get_started/introduction.md
================================================
## 介绍 MMCV

MMCV 是一个面向计算机视觉的基础库,它提供了以下功能:

- [图像和视频处理](../understand_mmcv/data_process.md)
- [图像和标注结果可视化](../understand_mmcv/visualization.md)
- [图像变换](../understand_mmcv/data_transform.md)
- [多种 CNN 网络结构](../understand_mmcv/cnn.md)
- [高质量实现的常见 CUDA 算子](../understand_mmcv/ops.md)

MMCV 支持多种平台,包括:

- Linux
- Windows
- macOS

它支持的 OpenMMLab 项目:

- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱
- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架


================================================
FILE: docs/zh_cn/get_started/previous_versions.md
================================================
## 其他版本的 PyTorch

我们不再提供在较低的 `PyTorch` 版本下编译的 `mmcv-full` 包,但为了您的方便,您可以在下面找到它们。

### PyTorch 1.4

| 1.0.0 \<= mmcv_version \<= 1.2.1

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.4.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.4.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.4.0/index.html
```

### PyTorch 1.3

| 1.0.0 \<= mmcv_version \<= 1.3.16

#### CUDA 10.1

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.3.0/index.html
```

#### CUDA 9.2

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.3.0/index.html
```

#### CPU

```bash
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.3.0/index.html
```


================================================
FILE: docs/zh_cn/index.rst
================================================
欢迎来到 MMCV 的中文文档!
=============================

您可以在页面左下角切换中英文文档。

.. toctree::
   :maxdepth: 2
   :caption: 介绍与安装

   get_started/introduction.md
   get_started/installation.md
   get_started/build.md
   get_started/article.md
   get_started/api_reference.md

.. toctree::
   :maxdepth: 2
   :caption: 深入理解 MMCV

   understand_mmcv/data_process.md
   understand_mmcv/data_transform.md
   understand_mmcv/visualization.md
   understand_mmcv/cnn.md
   understand_mmcv/ops.md

.. toctree::
   :caption: 语言切换

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: 兼容性

   compatibility.md

.. toctree::

   faq.md

.. toctree::
   :maxdepth: 2
   :caption: 社区

   community/contributing.md
   community/pr.md
   community/code_style.md

.. toctree::
   :maxdepth: 1
   :caption: API 文档

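   mmcv.image <api/image>
   mmcv.video <api/video>
   mmcv.visualization <api/visualization>
   mmcv.cnn <api/cnn>
   mmcv.ops <api/ops>
   mmcv.transforms <api/transforms>
   mmcv.arraymisc <api/arraymisc>
   mmcv.utils <api/utils>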


Indices and tables
==================

* :ref:`genindex`
* :ref:`search`


================================================
FILE: docs/zh_cn/make.bat
================================================
@ECHO OFF

REM Work from the directory that contains this script (restored by popd at :end).
pushd %~dp0

REM Command file for Sphinx documentation

REM Use the caller-provided SPHINXBUILD if set, otherwise default to sphinx-build.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=_build

REM Without a target argument, just print the Sphinx help text.
if "%1" == "" goto help

REM Probe run with all output discarded. An errorlevel of 9009 or higher is
REM treated as the sphinx-build command not being found, as the message says.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Forward the requested target to sphinx-build via its -M option, passing
REM along any extra options from SPHINXOPTS.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd


================================================
FILE: docs/zh_cn/switch_language.md
================================================
## [English](https://mmcv.readthedocs.io/en/latest/)

## [简体中文](https://mmcv.readthedocs.io/zh_CN/latest/)


================================================
FILE: docs/zh_cn/understand_mmcv/cnn.md
================================================
## 卷积神经网络

我们为卷积神经网络提供了一些构建模块,包括层构建、模块组件和权重初始化。

### 网络层的构建

在运行实验时,我们可能需要尝试同属一种类型但不同配置的层,但又不希望每次都修改代码。于是我们提供一些层构建方法,可以从字典构建层,字典可以在配置文件中配置,也可以通过命令行参数指定。

#### 用法

一个简单的例子:

```python
from mmcv.cnn import build_conv_layer

cfg = dict(type='Conv3d')
layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3)
```

- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv (Conv是Conv2d的别名)
- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN(BN是BN2d的别名)、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN(IN是IN2d的别名)
- `build_activation_layer`: 支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU
- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle
- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate

#### 拓展

我们还允许自定义层和算子来扩展构建方法。

1. 编写和注册自己的模块:

   ```python
   from mmengine.registry import MODELS

   @MODELS.register_module()
   class MyUpsample:

       def __init__(self, scale_factor):
           pass

       def forward(self, x):
           pass
   ```

2. 在某处导入 `MyUpsample` (例如 `__init__.py` )然后使用它:

   ```python
   from mmcv.cnn import build_upsample_layer

   cfg = dict(type='MyUpsample', scale_factor=2)
   layer = build_upsample_layer(cfg)
   ```

### 模块组件

我们还提供了常用的模块组件,以方便网络构建。
卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成,更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。

```python
from mmcv.cnn import ConvModule

# conv + bn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN'))
# conv + gn + relu
conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2))
# conv + relu
conv = ConvModule(3, 8, 2)
# conv
conv = ConvModule(3, 8, 2, act_cfg=None)
# conv + leaky relu
conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU'))
# bn + conv + relu
conv = ConvModule(
    3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act'))
```


================================================
FILE: docs/zh_cn/understand_mmcv/data_process.md
================================================
## 数据处理

### 图像

图像模块提供了一些图像预处理的函数,该模块依赖 `opencv` 。

#### 读取/保存/显示

使用 `imread` 和 `imwrite` 函数可以读取和保存图像。

```python
import mmcv

img = mmcv.imread('test.jpg')
img = mmcv.imread('test.jpg', flag='grayscale')
img_ = mmcv.imread(img)  # 相当于什么也没做
mmcv.imwrite(img, 'out.jpg')
```

从二进制中读取图像

```python
with open('test.jpg', 'rb') as f:
    data = f.read()
img = mmcv.imfrombytes(data)
```

显示图像文件或已读取的图像

```python
mmcv.imshow('tests/data/color.jpg')

for i in range(10):
    img = np.random.randint(256, size=(100, 100, 3), dtype=np.uint8)
    mmcv.imshow(img, win_name='test image', wait_time=200)
```

#### 色彩空间转换

支持的转换函数:

- bgr2gray
- gray2bgr
- bgr2rgb
- rgb2bgr
- bgr2hsv
- hsv2bgr

```python
img = mmcv.imread('tests/data/color.jpg')
img1 = mmcv.bgr2rgb(img)
img2 = mmcv.rgb2gray(img1)
img3 = mmcv.bgr2hsv(img)
```

#### 缩放

有三种缩放图像的方法。所有以 `imresize_*` 开头的函数都有一个 `return_scale` 参数,如果
该参数为 `False` ,函数的返回值只有调整之后的图像,否则是一个元组 `(resized_img, scale)` 。

```python
# 缩放图像至给定的尺寸
mmcv.imresize(img, (1000, 600), return_scale=True)

# 缩放图像至与给定的图像同样的尺寸
mmcv.imresize_like(img, dst_img, return_scale=False)

# 以一定的比例缩放图像
mmcv.imrescale(img, 0.5)

# 缩放图像至最长的边不大于1000、最短的边不大于800并且没有改变图像的长宽比
mmcv.imrescale(img, (1000, 800))
```

#### 旋转

我们可以使用 `imrotate` 旋转图像一定的角度。旋转的中心需要指定,默认值是原始图像的中心。有
两种旋转的模式,一种保持图像的尺寸不变,因此旋转后原始图像中的某些部分会被裁剪,另一种是扩大
图像的尺寸进而保留完整的原始图像。

```python
img = mmcv.imread('tests/data/color.jpg')

# 顺时针旋转图像30度
img_ = mmcv.imrotate(img, 30)

# 逆时针旋转图像90度
img_ = mmcv.imrotate(img, -90)

# 顺时针旋转图像30度并且缩放图像为原始图像的1.5倍
img_ = mmcv.imrotate(img, 30, scale=1.5)

# 以坐标(100, 100)为中心顺时针旋转图像30度
img_ = mmcv.imrotate(img, 30, center=(100, 100))

# 顺时针旋转图像30度并扩大图像的尺寸
img_ = mmcv.imrotate(img, 30, auto_bound=True)
```

#### 翻转

我们可以使用 `imflip` 翻转图像。

```python
img = mmcv.imread('tests/data/color.jpg')

# 水平翻转图像
mmcv.imflip(img)

# 垂直翻转图像
mmcv.imflip(img, direction='vertical')
```

#### 裁剪

`imcrop` 可以裁剪图像的一个或多个区域,每个区域用左上角和右下角坐标表示,形如(x1, y1, x2, y2)

```python
import mmcv
import numpy as np

img = mmcv.imread('tests/data/color.jpg')

# 裁剪区域 (10, 10, 100, 120)
bboxes = np.array([10, 10, 100, 120])
patch = mmcv.imcrop(img, bboxes)

# 裁剪两个区域,分别是 (10, 10, 100, 120) 和 (0, 0, 50, 50)
bboxes = np.array([[10, 10, 100, 120], [0, 0, 50, 50]])
patches = mmcv.imcrop(img, bboxes)

# 裁剪两个区域并且缩放区域1.2倍
patches = mmcv.imcrop(img, bboxes, scale=1.2)
```

#### 填充

`impad` 和 `impad_to_multiple` 可以用给定的值将图像填充至给定的尺寸。

```python
img = mmcv.imread('tests/data/color.jpg')

# 用给定值将图像填充至 (1000, 1200)
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=0)

# 用给定值分别填充图像的3个通道至 (1000, 1200)
img_ = mmcv.impad(img, shape=(1000, 1200), pad_val=(100, 50, 200))

# 用给定值填充图像的左、上、右、下四条边 (padding 的顺序为 left, top, right, bottom)
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=0)

# 用3个值分别填充图像的左、上、右、下四条边的3个通道
img_ = mmcv.impad(img, padding=(10, 20, 30, 40), pad_val=(100, 50, 200))

# 将图像的四条边填充至能够被给定值整除
img_ = mmcv.impad_to_multiple(img, 32)
```

### 视频

视频模块提供了以下的功能:

- 一个 `VideoReader` 类,具有友好的 API 接口可以读取和转换视频
- 一些编辑视频的方法,包括 `cut` , `concat` , `resize`
- 光流的读取/保存/变换

#### VideoReader

`VideoReader` 类提供了和序列一样的接口去获取视频帧。该类会缓存所有被访问过的帧。

```python
video = mmcv.VideoReader('test.mp4')

# 获取基本的信息
print(len(video))
print(video.width, video.height, video.resolution, video.fps)

# 遍历所有的帧
for frame in video:
    print(frame.shape)

# 读取下一帧
img = video.read()

# 使用索引获取帧
img = video[100]

# 获取指定范围的帧
img = video[5:10]
```

将视频切成帧并保存至给定目录或者从给定目录中生成视频。

```python
# 将视频切成帧并保存至目录
video = mmcv.VideoReader('test.mp4')
video.cvt2frames('out_dir')

# 从给定目录中生成视频
mmcv.frames2video('out_dir', 'test.avi')
```

#### 编辑函数

有几个用于编辑视频的函数,这些函数是对 `ffmpeg` 的封装。

```python
# 裁剪视频
mmcv.cut_video('test.mp4', 'clip1.mp4', start=3, end=10, vcodec='h264')

# 将多个视频拼接成一个视频
mmcv.concat_video(['clip1.mp4', 'clip2.mp4'], 'joined.mp4', log_level='quiet')

# 将视频缩放至给定的尺寸
mmcv.resize_video('test.mp4', 'resized1.mp4', (360, 240))

# 将视频缩放至给定的倍率
mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2)
```

#### 光流

`mmcv` 提供了以下用于操作光流的函数:

- 读取/保存
- 可视化
- 流变换

我们提供了两种将光流dump到文件的方法,分别是非压缩和压缩的方法。非压缩的方法直接将浮点数值的光流
保存至二进制文件,虽然光流无损但文件会比较大。而压缩的方法先量化光流至 0-255 整型数值再保存为
jpeg图像。光流的x维度和y维度会被拼接到图像中。

1. 读取/保存

```python
flow = np.random.rand(800, 600, 2).astype(np.float32)
# 保存光流到flo文件 (~3.7M)
mmcv.flowwrite(flow, 'uncompressed.flo')
# 保存光流为jpeg图像 (~230K),图像的尺寸为 (800, 1200)
mmcv.flowwrite(flow, 'compressed.jpg', quantize=True, concat_axis=1)

# 读取光流文件,以下两种方式读取的光流尺寸均为 (800, 600, 2)
flow = mmcv.flowread('uncompressed.flo')
flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1)
```

2. 可视化

使用 `mmcv.flowshow()` 可视化光流

```python
mmcv.flowshow(flow)
```

![progress](../../en/_static/flow_visualization.png)

3. 流变换

```python
img1 = mmcv.imread('img1.jpg')
flow = mmcv.flowread('flow.flo')
warped_img2 = mmcv.flow_warp(img1, flow)
```

img1 (左) 和 img2 (右)

![raw images](../../en/_static/flow_raw_images.png)

光流 (img2 -> img1)

![optical flow](../../en/_static/flow_img2toimg1.png)

变换后的图像和真实图像的差异

![warped image](../../en/_static/flow_warp_diff.png)


================================================
FILE: docs/zh_cn/understand_mmcv/data_transform.md
================================================
# 数据变换

在 OpenMMLab 算法库中,数据集的构建和数据的准备是相互解耦的。通常,数据集的构建只对数据集进行解析,记录每个样本的基本信息;而数据的准备则是通过一系列的数据变换,根据样本的基本信息进行数据加载、预处理、格式化等操作。

## 数据变换的设计

在 MMCV 中,我们使用各种可调用的数据变换类来进行数据的操作。这些数据变换类可以接受若干配置参数进行实例化,之后通过调用的方式对输入的数据字典进行处理。同时,我们约定所有数据变换都接受一个字典作为输入,并将处理后的数据输出为一个字典。一个简单的例子如下:

```python
>>> import numpy as np
>>> from mmcv.transforms import Resize
>>>
>>> transform = Resize(scale=(224, 224))
>>> data_dict = {'img': np.random.rand(256, 256, 3)}
>>> data_dict = transform(data_dict)
>>> print(data_dict['img'].shape)
(224, 224, 3)
```

数据变换类会读取输入字典的某些字段,并且可能添加、或者更新某些字段。这些字段的键大部分情况下是固定的,如 `Resize` 会固定地读取输入字典中的 `"img"` 等字段。我们可以在对应类的文档中了解对输入输出字段的约定。

```{note}
默认情况下,在需要图像尺寸作为**初始化参数**的数据变换 (如Resize, Pad) 中,图像尺寸的顺序均为 (width, height)。在数据变换**返回的字典**中,图像相关的尺寸, 如 `img_shape`、`ori_shape`、`pad_shape` 等,均为 (height, width)。
```

MMCV 为所有的数据变换类提供了一个统一的基类 (`BaseTransform`):

```python
class BaseTransform(metaclass=ABCMeta):

    def __call__(self, results: dict) -> dict:

        return self.transform(results)

    @abstractmethod
    def transform(self, results: dict) -> dict:
        pass
```

所有的数据变换类都需要继承 `BaseTransform`,并实现 `transform` 方法。`transform` 方法的输入和输出均为一个字典。在**自定义数据变换类**一节中,我们会更详细地介绍如何实现一个数据变换类。

## 数据流水线

如上所述,所有数据变换的输入和输出都是一个字典,而且根据 OpenMMLab 中 [有关数据集的约定](TODO),数据集中每个样本的基本信息都是一个字典。这样一来,我们可以将所有的数据变换操作首尾相接,组合成为一条数据流水线(data pipeline),输入数据集中样本的信息字典,输出完成一系列处理后的信息字典。

以分类任务为例,我们在下图展示了一个典型的数据流水线。对每个样本,数据集中保存的基本信息是一个如图中最左侧所示的字典,之后每经过一个由蓝色块代表的数据变换操作,数据字典中都会加入新的字段(标记为绿色)或更新现有的字段(标记为橙色)。

在配置文件中,数据流水线是一个若干数据变换配置字典组成的列表,每个数据集都需要设置参数 `pipeline` 来定义该数据集需要进行的数据准备操作。如上数据流水线在配置文件中的配置如下: ```python pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', size=256, keep_ratio=True), dict(type='CenterCrop', crop_size=224), dict(type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]), dict(type='ClsFormatBundle') ] dataset = dict( ... pipeline=pipeline, ... ) ``` ## 常用的数据变换类 按照功能,常用的数据变换类可以大致分为数据加载、数据预处理与增强、数据格式化。在 MMCV 中,我们提供了一些常用的数据变换类如下: ### 数据加载 为了支持大规模数据集的加载,通常在 `Dataset` 初始化时不加载数据,只加载相应的路径。因此需要在数据流水线中进行具体数据的加载。 | class | 功能 | | :-------------------------: | :---------------------------------------: | | [`LoadImageFromFile`](TODO) | 根据路径加载图像 | | [`LoadAnnotations`](TODO) | 加载和组织标注信息,如 bbox、语义分割图等 | ### 数据预处理及增强 数据预处理和增强通常是对图像本身进行变换,如裁剪、填充、缩放等。 | class | 功能 | | :------------------------------: | :--------------------------------: | | [`Pad`](TODO) | 填充图像边缘 | | [`CenterCrop`](TODO) | 居中裁剪 | | [`Normalize`](TODO) | 对图像进行归一化 | | [`Resize`](TODO) | 按照指定尺寸或比例缩放图像 | | [`RandomResize`](TODO) | 缩放图像至指定范围的随机尺寸 | | [`RandomMultiscaleResize`](TODO) | 缩放图像至多个尺寸中的随机一个尺寸 | | [`RandomGrayscale`](TODO) | 随机灰度化 | | [`RandomFlip`](TODO) | 图像随机翻转 | | [`MultiScaleFlipAug`](TODO) | 支持缩放和翻转的测试时数据增强 | ### 数据格式化 数据格式化操作通常是对数据进行的类型转换。 | class | 功能 | | :---------------------: | :-------------------------------: | | [`ToTensor`](TODO) | 将指定的数据转换为 `torch.Tensor` | | [`ImageToTensor`](TODO) | 将图像转换为 `torch.Tensor` | ## 自定义数据变换类 要实现一个新的数据变换类,需要继承 `BaseTransform`,并实现 `transform` 方法。这里,我们使用一个简单的翻转变换(`MyFlip`)作为示例: ```python import random import mmcv from mmcv.transforms import BaseTransform, TRANSFORMS @TRANSFORMS.register_module() class MyFlip(BaseTransform): def __init__(self, direction: str): super().__init__() self.direction = direction def transform(self, results: dict) -> dict: img = results['img'] results['img'] = mmcv.imflip(img, direction=self.direction) return results ``` 从而,我们可以实例化一个 `MyFlip` 对象,并将之作为一个可调用对象,来处理我们的数据字典。 ```python import numpy as 
np transform = MyFlip(direction='horizontal') data_dict = {'img': np.random.rand(224, 224, 3)} data_dict = transform(data_dict) processed_img = data_dict['img'] ``` 又或者,在配置文件的 pipeline 中使用 `MyFlip` 变换 ```python pipeline = [ ... dict(type='MyFlip', direction='horizontal'), ... ] ``` 需要注意的是,如需在配置文件中使用,需要保证 `MyFlip` 类所在的文件在运行时能够被导入。 ## 变换包装 变换包装是一种特殊的数据变换类,他们本身并不操作数据字典中的图像、标签等信息,而是对其中定义的数据变换的行为进行增强。 ### 字段映射(KeyMapper) 字段映射包装(`KeyMapper`)用于对数据字典中的字段进行映射。例如,一般的图像处理变换都从数据字典中的 `"img"` 字段获得值。但有些时候,我们希望这些变换处理数据字典中其他字段中的图像,比如 `"gt_img"` 字段。 如果配合注册器和配置文件使用的话,在配置文件中数据集的 `pipeline` 中如下例使用字段映射包装: ```python pipeline = [ ... dict(type='KeyMapper', mapping={ 'img': 'gt_img', # 将 "gt_img" 字段映射至 "img" 字段 'mask': ..., # 不使用原始数据中的 "mask" 字段。即对于被包装的数据变换,数据中不包含 "mask" 字段 }, auto_remap=True, # 在完成变换后,将 "img" 重映射回 "gt_img" 字段 transforms=[ # 在 `RandomFlip` 变换类中,我们只需要操作 "img" 字段即可 dict(type='RandomFlip'), ]) ... ] ``` 利用字段映射包装,我们在实现数据变换类时,不需要考虑在 `transform` 方法中考虑各种可能的输入字段名,只需要处理默认的字段即可。 ### 随机选择(RandomChoice)和随机执行(RandomApply) 随机选择包装(`RandomChoice`)用于从一系列数据变换组合中随机应用一个数据变换组合。利用这一包装,我们可以简单地实现一些数据增强功能,比如 AutoAugment。 如果配合注册器和配置文件使用的话,在配置文件中数据集的 `pipeline` 中如下例使用随机选择包装: ```python pipeline = [ ... dict(type='RandomChoice', transforms=[ [ dict(type='Posterize', bits=4), dict(type='Rotate', angle=30.) ], # 第一种随机变化组合 [ dict(type='Equalize'), dict(type='Rotate', angle=30) ], # 第二种随机变换组合 ], prob=[0.4, 0.6] # 两种随机变换组合各自的选用概率 ) ... ] ``` 随机执行包装(`RandomApply`)用于以指定概率随机执行数据变换组合。例如: ```python pipeline = [ ... dict(type='RandomApply', transforms=[dict(type='Rotate', angle=30.)], prob=0.3) # 以 0.3 的概率执行被包装的数据变换 ... ] ``` ### 多目标扩展(TransformBroadcaster) 通常,一个数据变换类只会从一个固定的字段读取操作目标。虽然我们也可以使用 `KeyMapper` 来改变读取的字段,但无法将变换一次性应用于多个字段的数据。为了实现这一功能,我们需要借助多目标扩展包装(`TransformBroadcaster`)。 多目标扩展包装(`TransformBroadcaster`)有两个用法,一是将数据变换作用于指定的多个字段,二是将数据变换作用于某个字段下的一组目标中。 1. 
应用于多个字段 假设我们需要将数据变换应用于 `"lq"` (low-quality) 和 `"gt"` (ground-truth) 两个字段中的图像上。 ```python pipeline = [ dict(type='TransformBroadcaster', # 分别应用于 "lq" 和 "gt" 两个字段,并将二者应设置 "img" 字段 mapping={'img': ['lq', 'gt']}, # 在完成变换后,将 "img" 字段重映射回原先的字段 auto_remap=True, # 是否在对各目标的变换中共享随机变量 # 更多介绍参加后续章节(随机变量共享) share_random_params=True, transforms=[ # 在 `RandomFlip` 变换类中,我们只需要操作 "img" 字段即可 dict(type='RandomFlip'), ]) ] ``` 在多目标扩展的 `mapping` 设置中,我们同样可以使用 `...` 来忽略指定的原始字段。如以下例子中,被包裹的 `RandomCrop` 会对字段 `"img"` 中的图像进行裁剪,并且在字段 `"img_shape"` 存在时更新剪裁后的图像大小。如果我们希望同时对两个图像字段 `"lq"` 和 `"gt"` 进行相同的随机裁剪,但只更新一次 `"img_shape"` 字段,可以通过例子中的方式实现: ```python pipeline = [ dict(type='TransformBroadcaster', mapping={ 'img': ['lq', 'gt'], 'img_shape': ['img_shape', ...], }, # 在完成变换后,将 "img" 和 "img_shape" 字段重映射回原先的字段 auto_remap=True, # 是否在对各目标的变换中共享随机变量 # 更多介绍参加后续章节(随机变量共享) share_random_params=True, transforms=[ # `RandomCrop` 类中会操作 "img" 和 "img_shape" 字段。若 "img_shape" 空缺, # 则只操作 "img" dict(type='RandomCrop'), ]) ] ``` 2. 应用于一个字段的一组目标 假设我们需要将数据变换应用于 `"images"` 字段,该字段为一个图像组成的 list。 ```python pipeline = [ dict(type='TransformBroadcaster', # 将 "images" 字段下的每张图片映射至 "img" 字段 mapping={'img': 'images'}, # 在完成变换后,将 "img" 字段下的图片重映射回 "images" 字段的列表中 auto_remap=True, # 是否在对各目标的变换中共享随机变量 share_random_params=True, transforms=[ # 在 `RandomFlip` 变换类中,我们只需要操作 "img" 字段即可 dict(type='RandomFlip'), ]) ] ``` #### 装饰器 `cache_randomness` 在 `TransformBroadcaster` 中,我们提供了 `share_random_params` 选项来支持在多次数据变换中共享随机状态。例如,在超分辨率任务中,我们希望将随机变换**同步**作用于低分辨率图像和原始图像。如果我们希望在自定义的数据变换类中使用这一功能,需要在类中标注哪些随机变量是支持共享的。这可以通过装饰器 `cache_randomness` 来实现。 以上文中的 `MyFlip` 为例,我们希望以一定的概率随机执行翻转: ```python from mmcv.transforms.utils import cache_randomness @TRANSFORMS.register_module() class MyRandomFlip(BaseTransform): def __init__(self, prob: float, direction: str): super().__init__() self.prob = prob self.direction = direction @cache_randomness # 标注该方法的输出为可共享的随机变量 def do_flip(self): flip = True if random.random() > self.prob else False return flip def 
transform(self, results: dict) -> dict: img = results['img'] if self.do_flip(): results['img'] = mmcv.imflip(img, direction=self.direction) return results ``` 在上面的例子中,我们用`cache_randomness` 装饰 `do_flip`方法,即将该方法返回值 `flip` 标注为一个支持共享的随机变量。进而,在 `TransformBroadcaster` 对多个目标的变换中,这一变量的值都会保持一致。 #### 装饰器 `avoid_cache_randomness` 在一些情况下,我们无法将数据变换中产生随机变量的过程单独放在类方法中。例如数据变换中使用的来自第三方库的模块,这些模块将随机变量相关的部分封装在了内部,导致无法将其抽出为数据变换的类方法。这样的数据变换无法通过装饰器 `cache_randomness` 标注支持共享的随机变量,进而无法在多目标扩展时共享随机变量。 为了避免在多目标扩展中误用此类数据变换,我们提供了另一个装饰器 `avoid_cache_randomness`,用来对此类数据变换进行标记: ```python from mmcv.transforms.utils import avoid_cache_randomness @TRANSFORMS.register_module() @avoid_cache_randomness class MyRandomTransform(BaseTransform): def transform(self, results: dict) -> dict: ... ``` 用 `avoid_cache_randomness` 标记的数据变换类,当其实例被 `TransformBroadcaster` 包装且将参数 `share_random_params` 设置为 True 时,会抛出异常,以此提醒用户不能这样使用。 在使用 `avoid_cache_randomness` 时需要注意以下几点: 1. `avoid_cache_randomness` 只用于装饰数据变换类(BaseTransfrom 的子类),而不能用与装饰其他一般的类、类方法或函数 2. 被 `avoid_cache_randomness` 修饰的数据变换作为基类时,其子类将**不会继承**这一特性。如果子类仍无法共享随机变量,则应再次使用 `avoid_cache_randomness` 修饰 3. 
只有当一个数据变换具有随机性,且无法共享随机参数时,才需要以 `avoid_cache_randomness` 修饰。无随机性的数据变换不需要修饰 ================================================ FILE: docs/zh_cn/understand_mmcv/ops.md ================================================ ## 算子 MMCV 提供了检测、分割等任务中常用的算子 | Device | CPU | CUDA | MLU | MPS | Ascend | | ---------------------------- | --- | ---- | --- | --- | ------ | | ActiveRotatedFilter | √ | √ | | | √ | | AssignScoreWithK | | √ | | | | | BallQuery | | √ | √ | | √ | | BBoxOverlaps | | √ | √ | √ | √ | | BorderAlign | | √ | | | | | BoxIouRotated | √ | √ | √ | | √ | | BoxIouQuadri | √ | √ | | | | | CARAFE | | √ | √ | | | | ChamferDistance | | √ | | | √ | | CrissCrossAttention | | √ | | | | | ContourExpand | √ | | | | | | ConvexIoU | | √ | | | | | CornerPool | | √ | | | | | Correlation | | √ | | | | | Deformable Convolution v1/v2 | √ | √ | √ | | √ | | Deformable RoIPool | | √ | √ | | √ | | DiffIoURotated | | √ | √ | | | | DynamicScatter | | √ | √ | | | | FurthestPointSample | | √ | | | √ | | FurthestPointSampleWithDist | | √ | | | √ | | FusedBiasLeakyrelu | | √ | | | √ | | GatherPoints | | √ | | | √ | | GroupPoints | | √ | | | | | Iou3d | | √ | √ | | | | KNN | | √ | | | | | MaskedConv | | √ | √ | | √ | | MergeCells | | √ | | | | | MinAreaPolygon | | √ | | | | | ModulatedDeformConv2d | √ | √ | √ | | √ | | MultiScaleDeformableAttn | | √ | √ | | √ | | NMS | √ | √ | √ | | √ | | NMSRotated | √ | √ | √ | | √ | | NMSQuadri | √ | √ | | | | | PixelGroup | √ | | | | | | PointsInBoxes | √ | √ | | | | | PointsInPolygons | | √ | | | | | PSAMask | √ | √ | √ | | √ | | RotatedFeatureAlign | √ | √ | √ | | √ | | RoIPointPool3d | | √ | √ | | | | RoIPool | | √ | √ | | √ | | RoIAlignRotated | √ | √ | √ | | √ | | RiRoIAlignRotated | | √ | | | | | RoIAlign | √ | √ | √ | | √ | | RoIAwarePool3d | | √ | √ | | | | SAConv2d | | √ | | | | | SigmoidFocalLoss | | √ | √ | | √ | | SoftmaxFocalLoss | | √ | | | √ | | SoftNMS | | √ | | | | | Sparse Convolution | | √ | √ | | | | Synchronized BatchNorm | | √ | | | | | 
ThreeInterpolate | | √ | | | | | ThreeNN | | √ | √ | | | | TINShift | | √ | √ | | | | UpFirDn2d | | √ | | | | | Voxelization | √ | √ | √ | | √ | | PrRoIPool | | √ | | | | | BezierAlign | √ | √ | | | | | BiasAct | | √ | | | | | FilteredLrelu | | √ | | | | | Conv2dGradfix | | √ | | | | ================================================ FILE: docs/zh_cn/understand_mmcv/visualization.md ================================================ ## 可视化 `mmcv` 可以展示图像以及标注(目前只支持标注框) ```python # 展示图像文件 mmcv.imshow('a.jpg') # 展示已加载的图像 img = np.random.rand(100, 100, 3) mmcv.imshow(img) # 展示带有标注框的图像 img = np.random.rand(100, 100, 3) bboxes = np.array([[0, 0, 50, 50], [20, 20, 60, 60]]) mmcv.imshow_bboxes(img, bboxes) ``` `mmcv` 也可以展示特殊的图像,例如光流 ```python flow = mmcv.flowread('test.flo') mmcv.flowshow(flow) ``` ================================================ FILE: mmcv/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # flake8: noqa from .arraymisc import * from .image import * from .transforms import * from .version import * from .video import * from .visualization import * # The following modules are not imported to this level, so mmcv may be used # without PyTorch. # - op # - utils ================================================ FILE: mmcv/arraymisc/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .quantization import dequantize, quantize __all__ = ['quantize', 'dequantize'] ================================================ FILE: mmcv/arraymisc/quantization.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Union import numpy as np def quantize(arr: np.ndarray, min_val: Union[int, float], max_val: Union[int, float], levels: int, dtype=np.int64) -> tuple: """Quantize an array of (-inf, inf) to [0, levels-1]. Args: arr (ndarray): Input array. min_val (int or float): Minimum value to be clipped. 
max_val (int or float): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the quantized array. Returns: tuple: Quantized array. """ if not (isinstance(levels, int) and levels > 1): raise ValueError( f'levels must be a positive integer, but got {levels}') if min_val >= max_val: raise ValueError( f'min_val ({min_val}) must be smaller than max_val ({max_val})') arr = np.clip(arr, min_val, max_val) - min_val quantized_arr = np.minimum( np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1) return quantized_arr def dequantize(arr: np.ndarray, min_val: Union[int, float], max_val: Union[int, float], levels: int, dtype=np.float64) -> tuple: """Dequantize an array. Args: arr (ndarray): Input array. min_val (int or float): Minimum value to be clipped. max_val (int or float): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the dequantized array. Returns: tuple: Dequantized array. """ if not (isinstance(levels, int) and levels > 1): raise ValueError( f'levels must be a positive integer, but got {levels}') if min_val >= max_val: raise ValueError( f'min_val ({min_val}) must be smaller than max_val ({max_val})') dequantized_arr = (arr + 0.5).astype(dtype) * (max_val - min_val) / levels + min_val return dequantized_arr ================================================ FILE: mmcv/cnn/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .alexnet import AlexNet # yapf: disable from .bricks import (ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, ConvTranspose2d, ConvTranspose3d, ConvWS2d, DepthwiseSeparableConvModule, GeneralizedAttention, HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish, build_activation_layer, build_conv_layer, build_norm_layer, build_padding_layer, build_plugin_layer, build_upsample_layer, conv_ws_2d, is_norm) # yapf: enable from .resnet import ResNet, make_res_layer from .rfsearch import Conv2dRFSearchOp, RFSearchHook from .utils import fuse_conv_bn, get_model_complexity_info from .vgg import VGG, make_vgg_layer __all__ = [ 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', 'ConvModule', 'build_activation_layer', 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', 'GeneralizedAttention', 'Scale', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'fuse_conv_bn', 'get_model_complexity_info', 'Conv2dRFSearchOp', 'RFSearchHook' ] ================================================ FILE: mmcv/cnn/alexnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging from typing import Optional import torch import torch.nn as nn from mmengine.runner import load_checkpoint class AlexNet(nn.Module): """AlexNet backbone. Args: num_classes (int): number of classes for classification. 
""" def __init__(self, num_classes: int = -1): super().__init__() self.num_classes = num_classes self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2), nn.Conv2d(64, 192, kernel_size=5, padding=2), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2), nn.Conv2d(192, 384, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.Conv2d(256, 256, kernel_size=3, padding=1), nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=3, stride=2), ) if self.num_classes > 0: self.classifier = nn.Sequential( nn.Dropout(), nn.Linear(256 * 6 * 6, 4096), nn.ReLU(inplace=True), nn.Dropout(), nn.Linear(4096, 4096), nn.ReLU(inplace=True), nn.Linear(4096, num_classes), ) def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: # use default initializer pass else: raise TypeError('pretrained must be a str or None') def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.features(x) if self.num_classes > 0: x = x.view(x.size(0), 256 * 6 * 6) x = self.classifier(x) return x ================================================ FILE: mmcv/cnn/bricks/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .activation import build_activation_layer from .context_block import ContextBlock from .conv import build_conv_layer from .conv2d_adaptive_padding import Conv2dAdaptivePadding from .conv_module import ConvModule from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d from .depthwise_separable_conv_module import DepthwiseSeparableConvModule from .drop import Dropout, DropPath from .generalized_attention import GeneralizedAttention from .hsigmoid import HSigmoid from .hswish import HSwish from .non_local import NonLocal1d, NonLocal2d, NonLocal3d from .norm import build_norm_layer, is_norm from .padding import build_padding_layer from .plugin import build_plugin_layer from .scale import LayerScale, Scale from .swish import Swish from .upsample import build_upsample_layer from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, Linear, MaxPool2d, MaxPool3d) __all__ = [ 'ConvModule', 'build_activation_layer', 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', 'Scale', 'ConvAWS2d', 'ConvWS2d', 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear', 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath', 'LayerScale' ] ================================================ FILE: mmcv/cnn/bricks/activation.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Dict import torch import torch.nn as nn import torch.nn.functional as F from mmengine.registry import MODELS from mmengine.utils import digit_version from mmengine.utils.dl_utils import TORCH_VERSION for module in [ nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, nn.Sigmoid, nn.Tanh ]: MODELS.register_module(module=module) if digit_version(torch.__version__) >= digit_version('1.7.0'): MODELS.register_module(module=nn.SiLU, name='SiLU') else: class SiLU(nn.Module): """Sigmoid Weighted Liner Unit.""" def __init__(self, inplace=False): super().__init__() self.inplace = inplace def forward(self, inputs) -> torch.Tensor: if self.inplace: return inputs.mul_(torch.sigmoid(inputs)) else: return inputs * torch.sigmoid(inputs) MODELS.register_module(module=SiLU, name='SiLU') @MODELS.register_module(name='Clip') @MODELS.register_module() class Clamp(nn.Module): """Clamp activation layer. This activation function is to clamp the feature map value within :math:`[min, max]`. More details can be found in ``torch.clamp()``. Args: min (Number | optional): Lower-bound of the range to be clamped to. Default to -1. max (Number | optional): Upper-bound of the range to be clamped to. Default to 1. """ def __init__(self, min: float = -1., max: float = 1.): super().__init__() self.min = min self.max = max def forward(self, x) -> torch.Tensor: """Forward function. Args: x (torch.Tensor): The input tensor. Returns: torch.Tensor: Clamped tensor. """ return torch.clamp(x, min=self.min, max=self.max) class GELU(nn.Module): r"""Applies the Gaussian Error Linear Units function: .. math:: \text{GELU}(x) = x * \Phi(x) where :math:`\Phi(x)` is the Cumulative Distribution Function for Gaussian Distribution. Shape: - Input: :math:`(N, *)` where `*` means, any number of additional dimensions - Output: :math:`(N, *)`, same shape as the input .. 
image:: scripts/activation_images/GELU.png Examples:: >>> m = nn.GELU() >>> input = torch.randn(2) >>> output = m(input) """ def forward(self, input: torch.Tensor) -> torch.Tensor: return F.gelu(input) if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.4')): MODELS.register_module(module=GELU) else: MODELS.register_module(module=nn.GELU) def build_activation_layer(cfg: Dict) -> nn.Module: """Build activation layer. Args: cfg (dict): The activation layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate an activation layer. Returns: nn.Module: Created activation layer. """ return MODELS.build(cfg) ================================================ FILE: mmcv/cnn/bricks/context_block.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Union import torch from mmengine.model import constant_init, kaiming_init from mmengine.registry import MODELS from torch import nn def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None: if isinstance(m, nn.Sequential): constant_init(m[-1], val=0) else: constant_init(m, val=0) @MODELS.register_module() class ContextBlock(nn.Module): """ContextBlock module in GCNet. See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' (https://arxiv.org/abs/1904.11492) for details. Args: in_channels (int): Channels of the input feature map. ratio (float): Ratio of channels of transform bottleneck pooling_type (str): Pooling method for context modeling. Options are 'att' and 'avg', stand for attention pooling and average pooling respectively. Default: 'att'. fusion_types (Sequence[str]): Fusion method for feature fusion, Options are 'channels_add', 'channel_mul', stand for channelwise addition and multiplication respectively. 
Default: ('channel_add',) """ _abbr_ = 'context_block' def __init__(self, in_channels: int, ratio: float, pooling_type: str = 'att', fusion_types: tuple = ('channel_add', )): super().__init__() assert pooling_type in ['avg', 'att'] assert isinstance(fusion_types, (list, tuple)) valid_fusion_types = ['channel_add', 'channel_mul'] assert all([f in valid_fusion_types for f in fusion_types]) assert len(fusion_types) > 0, 'at least one fusion should be used' self.in_channels = in_channels self.ratio = ratio self.planes = int(in_channels * ratio) self.pooling_type = pooling_type self.fusion_types = fusion_types if pooling_type == 'att': self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) self.softmax = nn.Softmax(dim=2) else: self.avg_pool = nn.AdaptiveAvgPool2d(1) if 'channel_add' in fusion_types: self.channel_add_conv = nn.Sequential( nn.Conv2d(self.in_channels, self.planes, kernel_size=1), nn.LayerNorm([self.planes, 1, 1]), nn.ReLU(inplace=True), # yapf: disable nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) else: self.channel_add_conv = None if 'channel_mul' in fusion_types: self.channel_mul_conv = nn.Sequential( nn.Conv2d(self.in_channels, self.planes, kernel_size=1), nn.LayerNorm([self.planes, 1, 1]), nn.ReLU(inplace=True), # yapf: disable nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) else: self.channel_mul_conv = None self.reset_parameters() def reset_parameters(self): if self.pooling_type == 'att': kaiming_init(self.conv_mask, mode='fan_in') self.conv_mask.inited = True if self.channel_add_conv is not None: last_zero_init(self.channel_add_conv) if self.channel_mul_conv is not None: last_zero_init(self.channel_mul_conv) def spatial_pool(self, x: torch.Tensor) -> torch.Tensor: batch, channel, height, width = x.size() if self.pooling_type == 'att': input_x = x # [N, C, H * W] input_x = input_x.view(batch, channel, height * width) # [N, 1, C, H * W] input_x = input_x.unsqueeze(1) # [N, 1, H, W] context_mask = self.conv_mask(x) # [N, 1, H * 
W] context_mask = context_mask.view(batch, 1, height * width) # [N, 1, H * W] context_mask = self.softmax(context_mask) # [N, 1, H * W, 1] context_mask = context_mask.unsqueeze(-1) # [N, 1, C, 1] context = torch.matmul(input_x, context_mask) # [N, C, 1, 1] context = context.view(batch, channel, 1, 1) else: # [N, C, 1, 1] context = self.avg_pool(x) return context def forward(self, x: torch.Tensor) -> torch.Tensor: # [N, C, 1, 1] context = self.spatial_pool(x) out = x if self.channel_mul_conv is not None: # [N, C, 1, 1] channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) out = out * channel_mul_term if self.channel_add_conv is not None: # [N, C, 1, 1] channel_add_term = self.channel_add_conv(context) out = out + channel_add_term return out ================================================ FILE: mmcv/cnn/bricks/conv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import inspect from typing import Dict, Optional from mmengine.registry import MODELS from torch import nn MODELS.register_module('Conv1d', module=nn.Conv1d) MODELS.register_module('Conv2d', module=nn.Conv2d) MODELS.register_module('Conv3d', module=nn.Conv3d) MODELS.register_module('Conv', module=nn.Conv2d) def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module: """Build convolution layer. Args: cfg (None or dict): The conv layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate an conv layer. args (argument list): Arguments passed to the `__init__` method of the corresponding conv layer. kwargs (keyword arguments): Keyword arguments passed to the `__init__` method of the corresponding conv layer. Returns: nn.Module: Created conv layer. 
""" if cfg is None: cfg_ = dict(type='Conv2d') else: if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') if 'type' not in cfg: raise KeyError('the cfg dict must contain the key "type"') cfg_ = cfg.copy() layer_type = cfg_.pop('type') if inspect.isclass(layer_type): return layer_type(*args, **kwargs, **cfg_) # type: ignore # Switch registry to the target scope. If `conv_layer` cannot be found # in the registry, fallback to search `conv_layer` in the # mmengine.MODELS. with MODELS.switch_scope_and_registry(None) as registry: conv_layer = registry.get(layer_type) if conv_layer is None: raise KeyError(f'Cannot find {conv_layer} in registry under scope ' f'name {registry.scope}') layer = conv_layer(*args, **kwargs, **cfg_) return layer ================================================ FILE: mmcv/cnn/bricks/conv2d_adaptive_padding.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math from typing import Tuple, Union import torch from mmengine.registry import MODELS from torch import nn from torch.nn import functional as F @MODELS.register_module() class Conv2dAdaptivePadding(nn.Conv2d): """Implementation of 2D convolution in tensorflow with `padding` as "same", which applies padding to input (if needed) so that input image gets fully covered by filter and stride you specified. For stride 1, this will ensure that output image size is same as input. For stride of 2, output dimensions will be half, for example. Args: in_channels (int): Number of channels in the input image out_channels (int): Number of channels produced by the convolution kernel_size (int or tuple): Size of the convolving kernel stride (int or tuple, optional): Stride of the convolution. Default: 1 padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 dilation (int or tuple, optional): Spacing between kernel elements. 
Default: 1 groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 bias (bool, optional): If ``True``, adds a learnable bias to the output. Default: ``True`` """ def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, Tuple[int, int]], stride: Union[int, Tuple[int, int]] = 1, padding: Union[int, Tuple[int, int]] = 0, dilation: Union[int, Tuple[int, int]] = 1, groups: int = 1, bias: bool = True): super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) def forward(self, x: torch.Tensor) -> torch.Tensor: img_h, img_w = x.size()[-2:] kernel_h, kernel_w = self.weight.size()[-2:] stride_h, stride_w = self.stride output_h = math.ceil(img_h / stride_h) output_w = math.ceil(img_w / stride_w) pad_h = ( max((output_h - 1) * self.stride[0] + (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) pad_w = ( max((output_w - 1) * self.stride[1] + (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) if pad_h > 0 or pad_w > 0: x = F.pad(x, [ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 ]) return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) ================================================ FILE: mmcv/cnn/bricks/conv_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings from functools import partial from typing import Dict, Optional, Tuple, Union import torch import torch.nn as nn from mmengine.model import constant_init, kaiming_init from mmengine.registry import MODELS from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm from .activation import build_activation_layer from .conv import build_conv_layer from .norm import build_norm_layer from .padding import build_padding_layer def efficient_conv_bn_eval_forward(bn: _BatchNorm, conv: nn.modules.conv._ConvNd, x: torch.Tensor): """ Implementation based on https://arxiv.org/abs/2305.11624 "Tune-Mode ConvBN Blocks For Efficient Transfer Learning" It leverages the associative law between convolution and affine transform, i.e., normalize (weight conv feature) = (normalize weight) conv feature. It works for Eval mode of ConvBN blocks during validation, and can be used for training as well. It reduces memory and computation cost. Args: bn (_BatchNorm): a BatchNorm module. conv (nn._ConvNd): a conv module x (torch.Tensor): Input feature map. 
""" # These lines of code are designed to deal with various cases # like bn without affine transform, and conv without bias weight_on_the_fly = conv.weight if conv.bias is not None: bias_on_the_fly = conv.bias else: bias_on_the_fly = torch.zeros_like(bn.running_var) if bn.weight is not None: bn_weight = bn.weight else: bn_weight = torch.ones_like(bn.running_var) if bn.bias is not None: bn_bias = bn.bias else: bn_bias = torch.zeros_like(bn.running_var) # shape of [C_out, 1, 1, 1] in Conv2d weight_coeff = torch.rsqrt(bn.running_var + bn.eps).reshape([-1] + [1] * (len(conv.weight.shape) - 1)) # shape of [C_out, 1, 1, 1] in Conv2d coefff_on_the_fly = bn_weight.view_as(weight_coeff) * weight_coeff # shape of [C_out, C_in, k, k] in Conv2d weight_on_the_fly = weight_on_the_fly * coefff_on_the_fly # shape of [C_out] in Conv2d bias_on_the_fly = bn_bias + coefff_on_the_fly.flatten() *\ (bias_on_the_fly - bn.running_mean) return conv._conv_forward(x, weight_on_the_fly, bias_on_the_fly) @MODELS.register_module() class ConvModule(nn.Module): """A conv block that bundles conv/norm/activation layers. This block simplifies the usage of convolution layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). It is based upon three build methods: `build_conv_layer()`, `build_norm_layer()` and `build_activation_layer()`. Besides, we add some additional features in this module. 1. Automatically set `bias` of the conv layer. 2. Spectral norm is supported. 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only supports zero and circular padding, and we add "reflect" padding mode. Args: in_channels (int): Number of channels in the input feature map. Same as that in ``nn._ConvNd``. out_channels (int): Number of channels produced by the convolution. Same as that in ``nn._ConvNd``. kernel_size (int | tuple[int]): Size of the convolving kernel. Same as that in ``nn._ConvNd``. stride (int | tuple[int]): Stride of the convolution. 
Same as that in ``nn._ConvNd``. padding (int | tuple[int]): Zero-padding added to both sides of the input. Same as that in ``nn._ConvNd``. dilation (int | tuple[int]): Spacing between kernel elements. Same as that in ``nn._ConvNd``. groups (int): Number of blocked connections from input channels to output channels. Same as that in ``nn._ConvNd``. bias (bool | str): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". conv_cfg (dict): Config dict for convolution layer. Default: None, which means using conv2d. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (dict): Config dict for activation layer. Default: dict(type='ReLU'). inplace (bool): Whether to use inplace mode for activation. Default: True. with_spectral_norm (bool): Whether use spectral norm in conv module. Default: False. padding_mode (str): If the `padding_mode` has not been supported by current `Conv2d` in PyTorch, we will use our own padding layer instead. Currently, we support ['zeros', 'circular'] with official implementation and ['reflect'] with our own implementation. Default: 'zeros'. order (tuple[str]): The order of conv/norm/activation layers. It is a sequence of "conv", "norm" and "act". Common examples are ("conv", "norm", "act") and ("act", "conv", "norm"). Default: ('conv', 'norm', 'act'). efficient_conv_bn_eval (bool): Whether use efficient conv when the consecutive bn is in eval mode (either training or testing), as proposed in https://arxiv.org/abs/2305.11624 . Default: `False`. 
""" _abbr_ = 'conv_block' def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, Tuple[int, int]], stride: Union[int, Tuple[int, int]] = 1, padding: Union[int, Tuple[int, int]] = 0, dilation: Union[int, Tuple[int, int]] = 1, groups: int = 1, bias: Union[bool, str] = 'auto', conv_cfg: Optional[Dict] = None, norm_cfg: Optional[Dict] = None, act_cfg: Optional[Dict] = dict(type='ReLU'), inplace: bool = True, with_spectral_norm: bool = False, padding_mode: str = 'zeros', order: tuple = ('conv', 'norm', 'act'), efficient_conv_bn_eval: bool = False): super().__init__() assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) official_padding_mode = ['zeros', 'circular'] self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.inplace = inplace self.with_spectral_norm = with_spectral_norm self.with_explicit_padding = padding_mode not in official_padding_mode self.order = order assert isinstance(self.order, tuple) and len(self.order) == 3 assert set(order) == {'conv', 'norm', 'act'} self.with_norm = norm_cfg is not None self.with_activation = act_cfg is not None # if the conv layer is before a norm layer, bias is unnecessary. 
if bias == 'auto': bias = not self.with_norm self.with_bias = bias if self.with_explicit_padding: pad_cfg = dict(type=padding_mode) self.padding_layer = build_padding_layer(pad_cfg, padding) # reset padding to 0 for conv module conv_padding = 0 if self.with_explicit_padding else padding # build convolution layer self.conv = build_conv_layer( conv_cfg, in_channels, out_channels, kernel_size, stride=stride, padding=conv_padding, dilation=dilation, groups=groups, bias=bias) # export the attributes of self.conv to a higher level for convenience self.in_channels = self.conv.in_channels self.out_channels = self.conv.out_channels self.kernel_size = self.conv.kernel_size self.stride = self.conv.stride self.padding = padding self.dilation = self.conv.dilation self.transposed = self.conv.transposed self.output_padding = self.conv.output_padding self.groups = self.conv.groups if self.with_spectral_norm: self.conv = nn.utils.spectral_norm(self.conv) # build normalization layers if self.with_norm: # norm layer is after conv layer if order.index('norm') > order.index('conv'): norm_channels = out_channels else: norm_channels = in_channels self.norm_name, norm = build_norm_layer( norm_cfg, norm_channels) # type: ignore self.add_module(self.norm_name, norm) if self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn( 'Unnecessary conv bias before batch/instance norm') else: self.norm_name = None # type: ignore self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) # build activation layer if self.with_activation: act_cfg_ = act_cfg.copy() # type: ignore # nn.Tanh has no 'inplace' argument if act_cfg_['type'] not in [ 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU' ]: act_cfg_.setdefault('inplace', inplace) self.activate = build_activation_layer(act_cfg_) # Use msra init by default self.init_weights() @property def norm(self): if self.norm_name: return getattr(self, self.norm_name) else: return None def init_weights(self): # 1. 
It is mainly for customized conv layers with their own # initialization manners by calling their own ``init_weights()``, # and we do not want ConvModule to override the initialization. # 2. For customized conv layers without their own initialization # manners (that is, they don't have their own ``init_weights()``) # and PyTorch's conv layers, they will be initialized by # this method with default ``kaiming_init``. # Note: For PyTorch's conv layers, they will be overwritten by our # initialization implementation using default ``kaiming_init``. if not hasattr(self.conv, 'init_weights'): if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': nonlinearity = 'leaky_relu' a = self.act_cfg.get('negative_slope', 0.01) else: nonlinearity = 'relu' a = 0 kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) if self.with_norm: constant_init(self.norm, 1, bias=0) def forward(self, x: torch.Tensor, activate: bool = True, norm: bool = True) -> torch.Tensor: layer_index = 0 while layer_index < len(self.order): layer = self.order[layer_index] if layer == 'conv': if self.with_explicit_padding: x = self.padding_layer(x) # if the next operation is norm and we have a norm layer in # eval mode and we have enabled `efficient_conv_bn_eval` for # the conv operator, then activate the optimized forward and # skip the next norm operator since it has been fused if layer_index + 1 < len(self.order) and \ self.order[layer_index + 1] == 'norm' and norm and \ self.with_norm and not self.norm.training and \ self.efficient_conv_bn_eval_forward is not None: self.conv.forward = partial( self.efficient_conv_bn_eval_forward, self.norm, self.conv) layer_index += 1 x = self.conv(x) del self.conv.forward else: x = self.conv(x) elif layer == 'norm' and norm and self.with_norm: x = self.norm(x) elif layer == 'act' and activate and self.with_activation: x = self.activate(x) layer_index += 1 return x def turn_on_efficient_conv_bn_eval(self, efficient_conv_bn_eval=True): # efficient_conv_bn_eval 
works for conv + bn # with `track_running_stats` option if efficient_conv_bn_eval and self.norm \ and isinstance(self.norm, _BatchNorm) \ and self.norm.track_running_stats: self.efficient_conv_bn_eval_forward = efficient_conv_bn_eval_forward # noqa: E501 else: self.efficient_conv_bn_eval_forward = None # type: ignore @staticmethod def create_from_conv_bn(conv: torch.nn.modules.conv._ConvNd, bn: torch.nn.modules.batchnorm._BatchNorm, efficient_conv_bn_eval=True) -> 'ConvModule': """Create a ConvModule from a conv and a bn module.""" self = ConvModule.__new__(ConvModule) super(ConvModule, self).__init__() self.conv_cfg = None self.norm_cfg = None self.act_cfg = None self.inplace = False self.with_spectral_norm = False self.with_explicit_padding = False self.order = ('conv', 'norm', 'act') self.with_norm = True self.with_activation = False self.with_bias = conv.bias is not None # build convolution layer self.conv = conv # export the attributes of self.conv to a higher level for convenience self.in_channels = self.conv.in_channels self.out_channels = self.conv.out_channels self.kernel_size = self.conv.kernel_size self.stride = self.conv.stride self.padding = self.conv.padding self.dilation = self.conv.dilation self.transposed = self.conv.transposed self.output_padding = self.conv.output_padding self.groups = self.conv.groups # build normalization layers self.norm_name, norm = 'bn', bn self.add_module(self.norm_name, norm) self.turn_on_efficient_conv_bn_eval(efficient_conv_bn_eval) return self ================================================ FILE: mmcv/cnn/bricks/conv_ws.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
def conv_ws_2d(input: torch.Tensor,
               weight: torch.Tensor,
               bias: Optional[torch.Tensor] = None,
               stride: Union[int, Tuple[int, int]] = 1,
               padding: Union[int, Tuple[int, int]] = 0,
               dilation: Union[int, Tuple[int, int]] = 1,
               groups: int = 1,
               eps: float = 1e-5) -> torch.Tensor:
    """2D convolution with a standardized weight.

    Each slice ``weight[i]`` along the first dimension is shifted by its
    own mean and divided by its own standard deviation plus ``eps`` before
    being handed to ``F.conv2d``.

    Args:
        input (torch.Tensor): Input passed to ``F.conv2d``.
        weight (torch.Tensor): Convolution weight; statistics are taken
            over everything except the first dimension.
        bias (torch.Tensor, optional): Bias passed to ``F.conv2d``.
        stride, padding, dilation, groups: Passed through to ``F.conv2d``.
        eps (float): Added to the standard deviation. Default: 1e-5.

    Returns:
        torch.Tensor: Output of ``F.conv2d`` with the standardized weight.
    """
    num_filters = weight.size(0)
    per_filter = weight.view(num_filters, -1)
    stat_shape = (num_filters, 1, 1, 1)
    filter_mean = per_filter.mean(dim=1, keepdim=True).view(*stat_shape)
    filter_std = per_filter.std(dim=1, keepdim=True).view(*stat_shape)
    standardized = (weight - filter_mean) / (filter_std + eps)
    return F.conv2d(input, standardized, bias, stride, padding, dilation,
                    groups)
Default: 1 padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0 dilation (int or tuple, optional): Spacing between kernel elements. Default: 1 groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 bias (bool, optional): If set True, adds a learnable bias to the output. Default: True """ def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, Tuple[int, int]], stride: Union[int, Tuple[int, int]] = 1, padding: Union[int, Tuple[int, int]] = 0, dilation: Union[int, Tuple[int, int]] = 1, groups: int = 1, bias: bool = True): super().__init__( in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) self.register_buffer('weight_gamma', torch.ones(self.out_channels, 1, 1, 1)) self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1)) def _get_weight(self, weight: torch.Tensor) -> torch.Tensor: weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) weight = (weight - mean) / std weight = self.weight_gamma * weight + self.weight_beta return weight def forward(self, x: torch.Tensor) -> torch.Tensor: weight = self._get_weight(self.weight) return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str, local_metadata: Dict, strict: bool, missing_keys: List[str], unexpected_keys: List[str], error_msgs: List[str]) -> None: """Override default load function. AWS overrides the function _load_from_state_dict to recover weight_gamma and weight_beta if they are missing. If weight_gamma and weight_beta are found in the checkpoint, this function will return after super()._load_from_state_dict. 
Otherwise, it will compute the mean and std of the pretrained weights and store them in weight_beta and weight_gamma. """ self.weight_gamma.data.fill_(-1) local_missing_keys: List = [] super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, local_missing_keys, unexpected_keys, error_msgs) if self.weight_gamma.data.mean() > 0: for k in local_missing_keys: missing_keys.append(k) return weight = self.weight.data weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) self.weight_beta.data.copy_(mean) self.weight_gamma.data.copy_(std) missing_gamma_beta = [ k for k in local_missing_keys if k.endswith('weight_gamma') or k.endswith('weight_beta') ] for k in missing_gamma_beta: local_missing_keys.remove(k) for k in local_missing_keys: missing_keys.append(k) ================================================ FILE: mmcv/cnn/bricks/depthwise_separable_conv_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Dict, Optional, Tuple, Union import torch import torch.nn as nn from .conv_module import ConvModule class DepthwiseSeparableConvModule(nn.Module): """Depthwise separable convolution module. See https://arxiv.org/pdf/1704.04861.pdf for details. This module can replace a ConvModule with the conv block replaced by two conv block: depthwise conv block and pointwise conv block. The depthwise conv block contains depthwise-conv/norm/activation layers. The pointwise conv block contains pointwise-conv/norm/activation layers. It should be noted that there will be norm/activation layer in the depthwise conv block if `norm_cfg` and `act_cfg` are specified. Args: in_channels (int): Number of channels in the input feature map. Same as that in ``nn._ConvNd``. out_channels (int): Number of channels produced by the convolution. Same as that in ``nn._ConvNd``. 
kernel_size (int | tuple[int]): Size of the convolving kernel. Same as that in ``nn._ConvNd``. stride (int | tuple[int]): Stride of the convolution. Same as that in ``nn._ConvNd``. Default: 1. padding (int | tuple[int]): Zero-padding added to both sides of the input. Same as that in ``nn._ConvNd``. Default: 0. dilation (int | tuple[int]): Spacing between kernel elements. Same as that in ``nn._ConvNd``. Default: 1. norm_cfg (dict): Default norm config for both depthwise ConvModule and pointwise ConvModule. Default: None. act_cfg (dict): Default activation config for both depthwise ConvModule and pointwise ConvModule. Default: dict(type='ReLU'). dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is 'default', it will be the same as `norm_cfg`. Default: 'default'. dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is 'default', it will be the same as `act_cfg`. Default: 'default'. pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is 'default', it will be the same as `norm_cfg`. Default: 'default'. pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is 'default', it will be the same as `act_cfg`. Default: 'default'. kwargs (optional): Other shared arguments for depthwise and pointwise ConvModule. See ConvModule for ref. """ def __init__(self, in_channels: int, out_channels: int, kernel_size: Union[int, Tuple[int, int]], stride: Union[int, Tuple[int, int]] = 1, padding: Union[int, Tuple[int, int]] = 0, dilation: Union[int, Tuple[int, int]] = 1, norm_cfg: Optional[Dict] = None, act_cfg: Dict = dict(type='ReLU'), dw_norm_cfg: Union[Dict, str] = 'default', dw_act_cfg: Union[Dict, str] = 'default', pw_norm_cfg: Union[Dict, str] = 'default', pw_act_cfg: Union[Dict, str] = 'default', **kwargs): super().__init__() assert 'groups' not in kwargs, 'groups should not be specified' # if norm/activation config of depthwise/pointwise ConvModule is not # specified, use default config. 
def drop_path(x: torch.Tensor,
              drop_prob: float = 0.,
              training: bool = False) -> torch.Tensor:
    """Drop paths (Stochastic Depth) per sample (when applied in main path of
    residual blocks).

    We follow the implementation
    https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py  # noqa: E501

    Args:
        x (torch.Tensor): Input whose first dimension indexes the samples.
        drop_prob (float): Probability of zeroing a whole sample.
            Default: 0.
        training (bool): When False the input is returned untouched.
            Default: False.

    Returns:
        torch.Tensor: ``x`` itself when nothing is dropped (``drop_prob``
        is 0 or ``training`` is False); otherwise a tensor in which each
        sample is either zeroed or scaled by ``1 / (1 - drop_prob)``.
    """
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    if keep_prob <= 0:
        # Every sample is dropped. Dividing by a zero keep probability
        # would give inf, and inf * 0 is NaN, so return zeros directly.
        # Multiplying keeps the result attached to the autograd graph.
        return x * 0
    # handle tensors with different dimensions, not just 4D tensors.
    shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + torch.rand(
        shape, dtype=x.dtype, device=x.device)
    # floor() turns the shifted uniform sample into a 0/1 keep mask
    output = x.div(keep_prob) * random_tensor.floor()
    return output
def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:
    """Builder for drop out layers.

    Args:
        cfg (dict): Config handed to ``MODELS.build``.
        default_args (dict, optional): Passed to ``MODELS.build`` as
            ``default_args``. Default: None.

    Returns:
        Any: Whatever ``MODELS.build`` returns for ``cfg``.
    """
    dropout_layer = MODELS.build(cfg, default_args=default_args)
    return dropout_layer
def __init__(self,
             in_channels: int,
             spatial_range: int = -1,
             num_heads: int = 9,
             position_embedding_dim: int = -1,
             position_magnitude: int = 1,
             kv_stride: int = 2,
             q_stride: int = 1,
             attention_type: str = '1111'):
    """Create the projection layers, bias terms and optional local mask.

    Args:
        in_channels (int): Channels of the input feature map.
        spatial_range (int): The spatial range; a negative value disables
            the local constraint map. Default: -1.
        num_heads (int): Number of attention heads. Default: 9.
        position_embedding_dim (int): Position embedding dimension; a
            non-positive value falls back to ``in_channels``. Default: -1.
        position_magnitude (int): Multiplier stored for the coordinate
            difference. Default: 1.
        kv_stride (int): Stride for the key/value feature map. Default: 2.
        q_stride (int): Stride for the query feature map. Default: 1.
        attention_type (str): Four binary digits selecting the attention
            terms. Default: '1111'.

    Raises:
        ValueError: If ``spatial_range >= 0`` and ``in_channels`` is
            neither 256 nor 512, the only sizes for which a constraint map
            length is defined.
    """
    super().__init__()

    # hard range means local range for non-local operation
    self.position_embedding_dim = (
        position_embedding_dim
        if position_embedding_dim > 0 else in_channels)

    self.position_magnitude = position_magnitude
    self.num_heads = num_heads
    self.in_channels = in_channels
    self.spatial_range = spatial_range
    self.kv_stride = kv_stride
    self.q_stride = q_stride
    self.attention_type = [bool(int(_)) for _ in attention_type]
    self.qk_embed_dim = in_channels // num_heads
    out_c = self.qk_embed_dim * num_heads

    # the ``kaiming_init`` flag marks layers for ``init_weights``
    if self.attention_type[0] or self.attention_type[1]:
        self.query_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_c,
            kernel_size=1,
            bias=False)
        self.query_conv.kaiming_init = True

    if self.attention_type[0] or self.attention_type[2]:
        self.key_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_c,
            kernel_size=1,
            bias=False)
        self.key_conv.kaiming_init = True

    self.v_dim = in_channels // num_heads
    self.value_conv = nn.Conv2d(
        in_channels=in_channels,
        out_channels=self.v_dim * num_heads,
        kernel_size=1,
        bias=False)
    self.value_conv.kaiming_init = True

    if self.attention_type[1] or self.attention_type[3]:
        self.appr_geom_fc_x = nn.Linear(
            self.position_embedding_dim // 2, out_c, bias=False)
        self.appr_geom_fc_x.kaiming_init = True

        self.appr_geom_fc_y = nn.Linear(
            self.position_embedding_dim // 2, out_c, bias=False)
        self.appr_geom_fc_y.kaiming_init = True

    if self.attention_type[2]:
        # uniform initial values in (-stdv, stdv]
        stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
        appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
        self.appr_bias = nn.Parameter(appr_bias_value)

    if self.attention_type[3]:
        stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
        geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
        self.geom_bias = nn.Parameter(geom_bias_value)

    self.proj_conv = nn.Conv2d(
        in_channels=self.v_dim * num_heads,
        out_channels=in_channels,
        kernel_size=1,
        bias=True)
    self.proj_conv.kaiming_init = True
    self.gamma = nn.Parameter(torch.zeros(1))

    if self.spatial_range >= 0:
        # only works when non local is after 3*3 conv
        if in_channels == 256:
            max_len = 84
        elif in_channels == 512:
            max_len = 42
        else:
            # Previously ``max_len`` stayed unbound here and the next line
            # failed with an opaque UnboundLocalError.
            raise ValueError(
                'spatial_range >= 0 is only supported when in_channels is '
                f'256 or 512, but got in_channels={in_channels}.')

        max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
        # 1 marks a masked query/key pair; the window around each query
        # position is cleared to 0 below
        local_constraint_map = np.ones(
            (max_len, max_len, max_len_kv, max_len_kv), dtype=int)
        for iy in range(max_len):
            for ix in range(max_len):
                # the last two axes have length max_len_kv, so the upper
                # slice bounds are clamped to that length
                local_constraint_map[
                    iy, ix,
                    max((iy - self.spatial_range) //
                        self.kv_stride, 0):min((iy + self.spatial_range +
                                                1) // self.kv_stride +
                                               1, max_len_kv),
                    max((ix - self.spatial_range) //
                        self.kv_stride, 0):min((ix + self.spatial_range +
                                                1) // self.kv_stride +
                                               1, max_len_kv)] = 0

        self.local_constraint_map = nn.Parameter(
            torch.from_numpy(local_constraint_map).byte(),
            requires_grad=False)

    if self.q_stride > 1:
        self.q_downsample = nn.AvgPool2d(
            kernel_size=1, stride=self.q_stride)
    else:
        self.q_downsample = None

    if self.kv_stride > 1:
        self.kv_downsample = nn.AvgPool2d(
            kernel_size=1, stride=self.kv_stride)
    else:
        self.kv_downsample = None

    self.init_weights()
leading to type mismatch # in fp16 mode. Cast it to support fp16 mode. h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype) h_idxs = h_idxs.view((h, 1)) * q_stride w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype) w_idxs = w_idxs.view((w, 1)) * q_stride h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to( device=device, dtype=dtype) h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to( device=device, dtype=dtype) w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride # (h, h_kv, 1) h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) h_diff *= self.position_magnitude # (w, w_kv, 1) w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) w_diff *= self.position_magnitude feat_range = torch.arange(0, feat_dim / 4).to( device=device, dtype=dtype) dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype) dim_mat = dim_mat**((4. / feat_dim) * feat_range) dim_mat = dim_mat.view((1, 1, -1)) embedding_x = torch.cat( ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) embedding_y = torch.cat( ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) return embedding_x, embedding_y def forward(self, x_input: torch.Tensor) -> torch.Tensor: num_heads = self.num_heads # use empirical_attention if self.q_downsample is not None: x_q = self.q_downsample(x_input) else: x_q = x_input n, _, h, w = x_q.shape if self.kv_downsample is not None: x_kv = self.kv_downsample(x_input) else: x_kv = x_input _, _, h_kv, w_kv = x_kv.shape if self.attention_type[0] or self.attention_type[1]: proj_query = self.query_conv(x_q).view( (n, num_heads, self.qk_embed_dim, h * w)) proj_query = proj_query.permute(0, 1, 3, 2) if self.attention_type[0] or self.attention_type[2]: proj_key = self.key_conv(x_kv).view( (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) if self.attention_type[1] or self.attention_type[3]: position_embed_x, position_embed_y = self.get_position_embedding( h, w, h_kv, w_kv, self.q_stride, 
self.kv_stride, x_input.device, x_input.dtype, self.position_embedding_dim) # (n, num_heads, w, w_kv, dim) position_feat_x = self.appr_geom_fc_x(position_embed_x).\ view(1, w, w_kv, num_heads, self.qk_embed_dim).\ permute(0, 3, 1, 2, 4).\ repeat(n, 1, 1, 1, 1) # (n, num_heads, h, h_kv, dim) position_feat_y = self.appr_geom_fc_y(position_embed_y).\ view(1, h, h_kv, num_heads, self.qk_embed_dim).\ permute(0, 3, 1, 2, 4).\ repeat(n, 1, 1, 1, 1) position_feat_x /= math.sqrt(2) position_feat_y /= math.sqrt(2) # accelerate for saliency only if (np.sum(self.attention_type) == 1) and self.attention_type[2]: appr_bias = self.appr_bias.\ view(1, num_heads, 1, self.qk_embed_dim).\ repeat(n, 1, 1, 1) energy = torch.matmul(appr_bias, proj_key).\ view(n, num_heads, 1, h_kv * w_kv) h = 1 w = 1 else: # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for if not self.attention_type[0]: energy = torch.zeros( n, num_heads, h, w, h_kv, w_kv, dtype=x_input.dtype, device=x_input.device) # attention_type[0]: appr - appr # attention_type[1]: appr - position # attention_type[2]: bias - appr # attention_type[3]: bias - position if self.attention_type[0] or self.attention_type[2]: if self.attention_type[0] and self.attention_type[2]: appr_bias = self.appr_bias.\ view(1, num_heads, 1, self.qk_embed_dim) energy = torch.matmul(proj_query + appr_bias, proj_key).\ view(n, num_heads, h, w, h_kv, w_kv) elif self.attention_type[0]: energy = torch.matmul(proj_query, proj_key).\ view(n, num_heads, h, w, h_kv, w_kv) elif self.attention_type[2]: appr_bias = self.appr_bias.\ view(1, num_heads, 1, self.qk_embed_dim).\ repeat(n, 1, 1, 1) energy += torch.matmul(appr_bias, proj_key).\ view(n, num_heads, 1, 1, h_kv, w_kv) if self.attention_type[1] or self.attention_type[3]: if self.attention_type[1] and self.attention_type[3]: geom_bias = self.geom_bias.\ view(1, num_heads, 1, self.qk_embed_dim) proj_query_reshape = (proj_query + geom_bias).\ view(n, num_heads, h, w, self.qk_embed_dim) energy_x = 
torch.matmul( proj_query_reshape.permute(0, 1, 3, 2, 4), position_feat_x.permute(0, 1, 2, 4, 3)) energy_x = energy_x.\ permute(0, 1, 3, 2, 4).unsqueeze(4) energy_y = torch.matmul( proj_query_reshape, position_feat_y.permute(0, 1, 2, 4, 3)) energy_y = energy_y.unsqueeze(5) energy += energy_x + energy_y elif self.attention_type[1]: proj_query_reshape = proj_query.\ view(n, num_heads, h, w, self.qk_embed_dim) proj_query_reshape = proj_query_reshape.\ permute(0, 1, 3, 2, 4) position_feat_x_reshape = position_feat_x.\ permute(0, 1, 2, 4, 3) position_feat_y_reshape = position_feat_y.\ permute(0, 1, 2, 4, 3) energy_x = torch.matmul(proj_query_reshape, position_feat_x_reshape) energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) energy_y = torch.matmul(proj_query_reshape, position_feat_y_reshape) energy_y = energy_y.unsqueeze(5) energy += energy_x + energy_y elif self.attention_type[3]: geom_bias = self.geom_bias.\ view(1, num_heads, self.qk_embed_dim, 1).\ repeat(n, 1, 1, 1) position_feat_x_reshape = position_feat_x.\ view(n, num_heads, w * w_kv, self.qk_embed_dim) position_feat_y_reshape = position_feat_y.\ view(n, num_heads, h * h_kv, self.qk_embed_dim) energy_x = torch.matmul(position_feat_x_reshape, geom_bias) energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) energy_y = torch.matmul(position_feat_y_reshape, geom_bias) energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) energy += energy_x + energy_y energy = energy.view(n, num_heads, h * w, h_kv * w_kv) if self.spatial_range >= 0: cur_local_constraint_map = \ self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ contiguous().\ view(1, 1, h*w, h_kv*w_kv) energy = energy.masked_fill_(cur_local_constraint_map.bool(), float('-inf')) attention = F.softmax(energy, 3) proj_value = self.value_conv(x_kv) proj_value_reshape = proj_value.\ view((n, num_heads, self.v_dim, h_kv * w_kv)).\ permute(0, 1, 3, 2) out = torch.matmul(attention, proj_value_reshape).\ permute(0, 1, 3, 2).\ contiguous().\ view(n, self.v_dim * 
def init_weights(self):
    """Run ``kaiming_init`` on every submodule flagged for it.

    A submodule is selected when it carries a truthy ``kaiming_init``
    attribute; all selected modules receive the same fan-in, uniform,
    leaky-relu (``a=1``) initialisation with zero bias.
    """
    flagged = (module for module in self.modules()
               if getattr(module, 'kaiming_init', False))
    for module in flagged:
        kaiming_init(
            module,
            mode='fan_in',
            nonlinearity='leaky_relu',
            bias=0,
            distribution='uniform',
            a=1)
class HSwish(nn.Module):
    """Hard Swish Module.

    This module applies the hard swish function:

    .. math::
        Hswish(x) = x * ReLU6(x + 3) / 6

    Args:
        inplace (bool): Forwarded to the internal ``nn.ReLU6``.
            Default: False.

    Returns:
        Tensor: The output tensor.
    """

    def __init__(self, inplace: bool = False):
        super().__init__()
        self.act = nn.ReLU6(inplace)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # ReLU6 is applied to the temporary ``x + 3``, never to ``x`` itself
        shifted = x + 3
        gate = self.act(shifted)
        return x * gate / 6
This module is proposed in "Non-local Neural Networks" Paper reference: https://arxiv.org/abs/1711.07971 Code reference: https://github.com/AlexHex7/Non-local_pytorch Args: in_channels (int): Channels of the input feature map. reduction (int): Channel reduction ratio. Default: 2. use_scale (bool): Whether to scale pairwise_weight by `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. Default: True. conv_cfg (None | dict): The config dict for convolution layers. If not specified, it will use `nn.Conv2d` for convolution layers. Default: None. norm_cfg (None | dict): The config dict for normalization layers. Default: None. (This parameter is only applicable to conv_out.) mode (str): Options are `gaussian`, `concatenation`, `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. """ def __init__(self, in_channels: int, reduction: int = 2, use_scale: bool = True, conv_cfg: Optional[Dict] = None, norm_cfg: Optional[Dict] = None, mode: str = 'embedded_gaussian', **kwargs): super().__init__() self.in_channels = in_channels self.reduction = reduction self.use_scale = use_scale self.inter_channels = max(in_channels // reduction, 1) self.mode = mode if mode not in [ 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' ]: raise ValueError("Mode should be in 'gaussian', 'concatenation', " f"'embedded_gaussian' or 'dot_product', but got " f'{mode} instead.') # g, theta, phi are defaulted as `nn.ConvNd`. # Here we use ConvModule for potential usage. 
def embedded_gaussian(self, theta_x: torch.Tensor,
                      phi_x: torch.Tensor) -> torch.Tensor:
    """Softmax-normalised pairwise weight from ``theta_x @ phi_x``.

    When ``self.use_scale`` is set, the product is divided in place by the
    square root of ``theta_x``'s last dimension before the softmax.

    Args:
        theta_x (torch.Tensor): Left operand of the matrix product.
        phi_x (torch.Tensor): Right operand of the matrix product.

    Returns:
        torch.Tensor: The product, softmax-normalised over its last
        dimension.
    """
    # NonLocal1d pairwise_weight: [N, H, H]
    # NonLocal2d pairwise_weight: [N, HxW, HxW]
    # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
    similarity = torch.matmul(theta_x, phi_x)
    if self.use_scale:
        # theta_x.shape[-1] is `self.inter_channels`
        scale = theta_x.shape[-1]**0.5
        similarity.div_(scale)
    return similarity.softmax(dim=-1)
def concatenation(self, theta_x: torch.Tensor,
                  phi_x: torch.Tensor) -> torch.Tensor:
    """Pairwise weight from projecting concatenated theta/phi features.

    ``theta_x`` is repeated along its last dimension and ``phi_x`` along
    its third dimension so the two can be concatenated on the channel
    dimension; ``self.concat_project`` is applied to the result, and its
    output is reshaped to three dimensions and divided in place by the
    size of its last dimension.

    Args:
        theta_x (torch.Tensor): 4D tensor; ``size(2)`` gives the number of
            rows of the result.
        phi_x (torch.Tensor): 4D tensor; ``size(3)`` gives the number of
            columns of the result.

    Returns:
        torch.Tensor: The 3D pairwise weight.
    """
    # NonLocal1d pairwise_weight: [N, H, H]
    # NonLocal2d pairwise_weight: [N, HxW, HxW]
    # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
    num_rows = theta_x.size(2)
    num_cols = phi_x.size(3)
    tiled_theta = theta_x.repeat(1, 1, 1, num_cols)
    tiled_phi = phi_x.repeat(1, 1, num_rows, 1)

    stacked = torch.cat([tiled_theta, tiled_phi], dim=1)
    projected = self.concat_project(stacked)

    batch, _, rows, cols = projected.size()
    pairwise_weight = projected.view(batch, rows, cols)
    pairwise_weight.div_(pairwise_weight.shape[-1])
    return pairwise_weight
@MODELS.register_module()
class NonLocal2d(_NonLocalNd):
    """2D Non-local module.

    Args:
        in_channels (int): Same as `NonLocalND`.
        sub_sample (bool): When True, a 2x2 max pooling layer is attached
            to ``g`` and to ``phi`` (Note that the `sub_sample` is applied
            on spatial only). Default: False.
        conv_cfg (None | dict): Same as `NonLocalND`.
            Default: dict(type='Conv2d').
    """

    _abbr_ = 'nonlocal_block'

    def __init__(self,
                 in_channels: int,
                 sub_sample: bool = False,
                 conv_cfg: Dict = dict(type='Conv2d'),
                 **kwargs):
        super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)

        self.sub_sample = sub_sample
        if not sub_sample:
            return

        # one pooling layer instance is shared by ``g`` and ``phi``
        pool = nn.MaxPool2d(kernel_size=(2, 2))
        self.g = nn.Sequential(self.g, pool)
        if self.mode == 'gaussian':
            # in gaussian mode ``phi`` is only the pooling layer
            self.phi = pool
        else:
            self.phi = nn.Sequential(self.phi, pool)
def infer_abbr(class_type):
    """Infer abbreviation from the class name.

    When we build a norm layer with `build_norm_layer()`, we want to preserve
    the norm type in variable names, e.g, self.bn1, self.gn. This method will
    infer the abbreviation to map class types to abbreviations.

    Rule 1: If the class has the property "_abbr_", return the property.
    Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
    InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
    "in" respectively.
    Rule 3: If the class name contains "batch", "group", "layer" or "instance",
    the abbreviation of this layer will be "bn", "gn", "ln" and "in"
    respectively.
    Rule 4: Otherwise, the abbreviation falls back to "norm_layer".

    Args:
        class_type (type): The norm layer type.

    Returns:
        str: The inferred abbreviation.

    Raises:
        TypeError: If ``class_type`` is not a class.
    """
    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')

    # Rule 1: an explicit abbreviation on the class always wins.
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_

    # Rule 2: known base classes. Instance norm is tested first.
    if issubclass(class_type, _InstanceNorm):  # IN is a subclass of BN
        return 'in'
    if issubclass(class_type, _BatchNorm):
        return 'bn'
    if issubclass(class_type, nn.GroupNorm):
        return 'gn'
    if issubclass(class_type, nn.LayerNorm):
        return 'ln'

    # Rule 3: keyword in the lower-cased class name, checked in this order.
    class_name = class_type.__name__.lower()
    for keyword, abbr in (('batch', 'bn'), ('group', 'gn'), ('layer', 'ln'),
                          ('instance', 'in')):
        if keyword in class_name:
            return abbr

    # Rule 4: generic fallback.
    return 'norm_layer'


def build_norm_layer(cfg: Dict,
                     num_features: int,
                     postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:
    """Build normalization layer.

    Args:
        cfg (dict): The norm layer config, which should contain:

            - type (str | type): Layer type, either a registered name or the
              layer class itself.
            - layer args: Args needed to instantiate a norm layer.
            - requires_grad (bool, optional): Whether the layer parameters
              require gradients. Defaults to True.
        num_features (int): Number of input channels.
        postfix (int | str): The postfix to be appended into norm abbreviation
            to create named layer.

    Returns:
        tuple[str, nn.Module]: The first element is the layer name consisting
        of abbreviation and postfix, e.g., bn1, gn. The second element is the
        created norm layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no "type" key, or the type name cannot be
            found in the registry.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    # Work on a copy so the caller's config is left untouched.
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        norm_layer = layer_type
    else:
        # Switch registry to the target scope. If `norm_layer` cannot be found
        # in the registry, fallback to search `norm_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            norm_layer = registry.get(layer_type)
        if norm_layer is None:
            # Report the requested name; `norm_layer` is always None here.
            raise KeyError(f'Cannot find {layer_type} in registry under '
                           f'scope name {registry.scope}')
    abbr = infer_abbr(norm_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    requires_grad = cfg_.pop('requires_grad', True)
    cfg_.setdefault('eps', 1e-5)
    if norm_layer is not nn.GroupNorm:
        layer = norm_layer(num_features, **cfg_)
        if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
            layer._specify_ddp_gpu_num(1)
    else:
        # GroupNorm takes the channel count as `num_channels` and additionally
        # needs `num_groups` from the config.
        assert 'num_groups' in cfg_
        layer = norm_layer(num_channels=num_features, **cfg_)

    for param in layer.parameters():
        param.requires_grad = requires_grad

    return name, layer


def is_norm(layer: nn.Module,
            exclude: Union[type, tuple, None] = None) -> bool:
    """Check if a layer is a normalization layer.

    Args:
        layer (nn.Module): The layer to be checked.
        exclude (type | tuple[type]): Types to be excluded.

    Returns:
        bool: Whether the layer is a norm layer.

    Raises:
        TypeError: If ``exclude`` is neither None, a type, nor a tuple of
            types.
    """
    if exclude is not None:
        # Normalise a single type into a one-element tuple.
        if not isinstance(exclude, tuple):
            exclude = (exclude, )
        if not is_tuple_of(exclude, type):
            raise TypeError(
                f'"exclude" must be either None or type or a tuple of types, '
                f'but got {type(exclude)}: {exclude}')

    if exclude and isinstance(layer, exclude):
        return False

    all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
    return isinstance(layer, all_norm_bases)
def infer_abbr(class_type: type) -> str:
    """Infer abbreviation from the class name.

    This method will infer the abbreviation to map class types to
    abbreviations.

    Rule 1: If the class has the property "_abbr_", return the property.
    Rule 2: Otherwise, the abbreviation falls back to snake case of class
    name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.

    Args:
        class_type (type): The plugin layer type.

    Returns:
        str: The inferred abbreviation.

    Raises:
        TypeError: If ``class_type`` is not a class.
    """

    def camel2snack(word):
        """Convert camel case word into snack case.

        Modified from `inflection lib`.

        Example::

            >>> camel2snack("FancyBlock")
            'fancy_block'
        """
        # Split an upper-case run from a following capitalised word
        # ("HTTPServer" -> "HTTP_Server").
        word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
        # Split a lower-case letter or digit from a following capital.
        word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
        word = word.replace('-', '_')
        return word.lower()

    if not inspect.isclass(class_type):
        raise TypeError(
            f'class_type must be a type, but got {type(class_type)}')
    if hasattr(class_type, '_abbr_'):
        return class_type._abbr_  # type: ignore
    else:
        return camel2snack(class_type.__name__)


def build_plugin_layer(cfg: Dict,
                       postfix: Union[int, str] = '',
                       **kwargs) -> Tuple[str, nn.Module]:
    """Build plugin layer.

    Args:
        cfg (dict): cfg should contain:

            - type (str | type): identify plugin layer type, either a
              registered name or the layer class itself.
            - layer args: args needed to instantiate a plugin layer.
        postfix (int, str): appended into the plugin abbreviation to
            create named layer. Default: ''.
        **kwargs: Extra keyword arguments forwarded to the layer constructor
            together with the remaining entries of ``cfg``.

    Returns:
        tuple[str, nn.Module]: The first one is the concatenation of
        abbreviation and postfix. The second is the created plugin layer.

    Raises:
        TypeError: If ``cfg`` is not a dict.
        KeyError: If ``cfg`` has no "type" key, or the type name cannot be
            found in the registry.
    """
    if not isinstance(cfg, dict):
        raise TypeError('cfg must be a dict')
    if 'type' not in cfg:
        raise KeyError('the cfg dict must contain the key "type"')
    # Work on a copy so the caller's config is left untouched.
    cfg_ = cfg.copy()

    layer_type = cfg_.pop('type')

    if inspect.isclass(layer_type):
        plugin_layer = layer_type
    else:
        # Switch registry to the target scope. If `plugin_layer` cannot be
        # found in the registry, fallback to search `plugin_layer` in the
        # mmengine.MODELS.
        with MODELS.switch_scope_and_registry(None) as registry:
            plugin_layer = registry.get(layer_type)
        if plugin_layer is None:
            # Report the requested name; `plugin_layer` is always None here.
            raise KeyError(
                f'Cannot find {layer_type} in registry under scope '
                f'name {registry.scope}')
    abbr = infer_abbr(plugin_layer)

    assert isinstance(postfix, (int, str))
    name = abbr + str(postfix)

    layer = plugin_layer(**kwargs, **cfg_)

    return name, layer
class LayerScale(nn.Module):
    """LayerScale layer.

    Multiplies the input by a learnable per-channel weight of shape
    ``(dim,)``, broadcast along the channel axis selected by ``data_format``.

    Args:
        dim (int): Dimension of input features.
        inplace (bool): Whether performs operation in-place.
            Default: `False`.
        data_format (str): The input data format, could be 'channels_last'
            or 'channels_first'. With 'channels_last' the weight is applied
            along the last dimension (e.g. (B, N, C) data); with
            'channels_first' it is applied along dimension 1 (e.g.
            (B, C, H, W) data). Default: 'channels_last'.
        scale (float): Initial value of scale factor. Default: 1e-5
    """

    def __init__(self,
                 dim: int,
                 inplace: bool = False,
                 data_format: str = 'channels_last',
                 scale: float = 1e-5):
        super().__init__()
        assert data_format in ('channels_last', 'channels_first'), \
            "'data_format' could only be channels_last or channels_first."
        self.inplace = inplace
        self.data_format = data_format
        self.weight = nn.Parameter(torch.ones(dim) * scale)

    def forward(self, x) -> torch.Tensor:
        """Scale ``x`` by the learnable weight.

        Args:
            x (Tensor): Input whose channel axis (dimension 1 for
                'channels_first', the last dimension otherwise) has size
                ``dim``.

        Returns:
            Tensor: The scaled tensor. When ``inplace`` is True this is ``x``
            itself, modified in place.
        """
        # Build a view shape that puts -1 on the channel axis and 1 on every
        # other axis, so the weight broadcasts against `x`.
        if self.data_format == 'channels_first':
            shape = (1, -1) + (1, ) * (x.dim() - 2)
        else:
            shape = (1, ) * (x.dim() - 1) + (-1, )
        weight = self.weight.view(*shape)
        if self.inplace:
            return x.mul_(weight)
        return x * weight
""" def __init__(self): super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(x) ================================================ FILE: mmcv/cnn/bricks/transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import math import warnings from typing import Sequence import torch import torch.nn as nn import torch.nn.functional as F from mmengine.config import ConfigDict from mmengine.model import BaseModule, ModuleList, Sequential from mmengine.registry import MODELS from mmengine.utils import deprecated_api_warning, to_2tuple from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, build_norm_layer) from .drop import build_dropout from .scale import LayerScale # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: from mmcv.ops.multi_scale_deform_attn import \ MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 )) except ImportError: warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' '``mmcv.ops.multi_scale_deform_attn``, ' 'You should install ``mmcv`` rather than ``mmcv-lite`` ' 'if you need this module. 
def build_positional_encoding(cfg, default_args=None):
    """Build a positional encoding module from ``cfg`` via ``MODELS``."""
    module = MODELS.build(cfg, default_args=default_args)
    return module


def build_attention(cfg, default_args=None):
    """Build an attention module from ``cfg`` via ``MODELS``."""
    module = MODELS.build(cfg, default_args=default_args)
    return module


def build_feedforward_network(cfg, default_args=None):
    """Build a feed-forward network (FFN) from ``cfg`` via ``MODELS``."""
    module = MODELS.build(cfg, default_args=default_args)
    return module


def build_transformer_layer(cfg, default_args=None):
    """Build a transformer layer from ``cfg`` via ``MODELS``."""
    module = MODELS.build(cfg, default_args=default_args)
    return module


def build_transformer_layer_sequence(cfg, default_args=None):
    """Build a transformer encoder or decoder from ``cfg`` via ``MODELS``."""
    module = MODELS.build(cfg, default_args=default_args)
    return module


class AdaptivePadding(nn.Module):
    """Applies padding adaptively to the input.

    This module can make input get fully covered by filter
    you specified. It support two modes "same" and "corner". The
    "same" mode is same with "SAME" padding mode in TensorFlow, pad
    zero around input. The "corner"  mode would pad zero
    to bottom right.

    Args:
        kernel_size (int | tuple): Size of the kernel. Default: 1.
        stride (int | tuple): Stride of the filter. Default: 1.
        dilation (int | tuple): Spacing between kernel elements.
            Default: 1.
        padding (str): Support "same" and "corner", "corner" mode
            would pad zero to bottom right, and "same" mode would
            pad zero around input. Default: "corner".

    Example:
        >>> kernel_size = 16
        >>> stride = 16
        >>> dilation = 1
        >>> input = torch.rand(1, 1, 15, 17)
        >>> adap_pad = AdaptivePadding(
        >>>     kernel_size=kernel_size,
        >>>     stride=stride,
        >>>     dilation=dilation,
        >>>     padding="corner")
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
        >>> input = torch.rand(1, 1, 16, 17)
        >>> out = adap_pad(input)
        >>> assert (out.shape[2], out.shape[3]) == (16, 32)
    """

    def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
        super().__init__()
        assert padding in ('same', 'corner')

        # Every geometry argument is stored as an (h, w) pair.
        kernel_pair = to_2tuple(kernel_size)
        stride_pair = to_2tuple(stride)
        dilation_pair = to_2tuple(dilation)

        self.padding = padding
        self.kernel_size = kernel_pair
        self.stride = stride_pair
        self.dilation = dilation_pair

    def get_pad_shape(self, input_shape):
        """Calculate the padding size of input.

        Args:
            input_shape (:obj:`torch.Size`): arrange as (H, W).

        Returns:
            Tuple[int]: The padding size along the
            original H and W directions
        """

        def _required(size, kernel, stride, dilation):
            # Number of windows needed to cover `size` with this stride,
            # then the total extent those windows span minus what we have.
            n_out = math.ceil(size / stride)
            span = (n_out - 1) * stride + (kernel - 1) * dilation + 1
            return max(span - size, 0)

        input_h, input_w = input_shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h = _required(input_h, kernel_h, stride_h, self.dilation[0])
        pad_w = _required(input_w, kernel_w, stride_w, self.dilation[1])
        return pad_h, pad_w

    def forward(self, x):
        """Add padding to `x`

        Args:
            x (Tensor): Input tensor has shape (B, C, H, W).

        Returns:
            Tensor: The tensor with adaptive padding
        """
        pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
        if pad_h <= 0 and pad_w <= 0:
            return x

        # F.pad order for the last two dims: (left, right, top, bottom).
        if self.padding == 'corner':
            amounts = [0, pad_w, 0, pad_h]
        elif self.padding == 'same':
            left, top = pad_w // 2, pad_h // 2
            amounts = [left, pad_w - left, top, pad_h - top]
        else:
            return x
        return F.pad(x, amounts)
Default: 768 conv_type (str): The type of convolution to generate patch embedding. Default: "Conv2d". kernel_size (int): The kernel_size of embedding conv. Default: 16. stride (int): The slide stride of embedding conv. Default: 16. padding (int | tuple | string): The padding length of embedding conv. When it is a string, it means the mode of adaptive padding, support "same" and "corner" now. Default: "corner". dilation (int): The dilation rate of embedding conv. Default: 1. bias (bool): Bias of embed conv. Default: True. norm_cfg (dict, optional): Config dict for normalization layer. Default: None. input_size (int | tuple | None): The size of input, which will be used to calculate the out size. Only works when `dynamic_size` is False. Default: None. init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. """ def __init__(self, in_channels=3, embed_dims=768, conv_type='Conv2d', kernel_size=16, stride=16, padding='corner', dilation=1, bias=True, norm_cfg=None, input_size=None, init_cfg=None): super().__init__(init_cfg=init_cfg) self.embed_dims = embed_dims if stride is None: stride = kernel_size kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding) # disable the padding of conv padding = 0 else: self.adaptive_padding = None padding = to_2tuple(padding) self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, out_channels=embed_dims, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias) if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, embed_dims)[1] else: self.norm = None if input_size: input_size = to_2tuple(input_size) # `init_out_size` would be used outside to # calculate the num_patches # e.g. 
when `use_abs_pos_embed` outside self.init_input_size = input_size if self.adaptive_padding: pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) input_h, input_w = input_size input_h = input_h + pad_h input_w = input_w + pad_w input_size = (input_h, input_w) # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html h_out = (input_size[0] + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) // stride[0] + 1 w_out = (input_size[1] + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) // stride[1] + 1 self.init_out_size = (h_out, w_out) else: self.init_input_size = None self.init_out_size = None def forward(self, x): """ Args: x (Tensor): Has shape (B, C, H, W). In most case, C is 3. Returns: tuple: Contains merged results and its spatial shape. - x (Tensor): Has shape (B, out_h * out_w, embed_dims) - out_size (tuple[int]): Spatial shape of x, arrange as (out_h, out_w). """ if self.adaptive_padding: x = self.adaptive_padding(x) x = self.projection(x) out_size = (x.shape[2], x.shape[3]) x = x.flatten(2).transpose(1, 2) if self.norm is not None: x = self.norm(x) return x, out_size class PatchMerging(BaseModule): """Merge patch feature map. This layer groups feature map by kernel_size, and applies norm and linear layers to the grouped feature map ((used in Swin Transformer)). Our implementation uses `nn.Unfold` to merge patches, which is about 25% faster than the original implementation. However, we need to modify pretrained models for compatibility. Args: in_channels (int): The num of input channels. to gets fully covered by filter and stride you specified. out_channels (int): The num of output channels. kernel_size (int | tuple, optional): the kernel size in the unfold layer. Defaults to 2. stride (int | tuple, optional): the stride of the sliding blocks in the unfold layer. Default: None. (Would be set as `kernel_size`) padding (int | tuple | string ): The padding length of embedding conv. 
When it is a string, it means the mode of adaptive padding, support "same" and "corner" now. Default: "corner". dilation (int | tuple, optional): dilation parameter in the unfold layer. Default: 1. bias (bool, optional): Whether to add bias in linear layer or not. Defaults: False. norm_cfg (dict, optional): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (dict, optional): The extra config for initialization. Default: None. """ def __init__(self, in_channels, out_channels, kernel_size=2, stride=None, padding='corner', dilation=1, bias=False, norm_cfg=dict(type='LN'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels if stride: stride = stride else: stride = kernel_size kernel_size = to_2tuple(kernel_size) stride = to_2tuple(stride) dilation = to_2tuple(dilation) if isinstance(padding, str): self.adaptive_padding = AdaptivePadding( kernel_size=kernel_size, stride=stride, dilation=dilation, padding=padding) # disable the padding of unfold padding = 0 else: self.adaptive_padding = None padding = to_2tuple(padding) self.sampler = nn.Unfold( kernel_size=kernel_size, dilation=dilation, padding=padding, stride=stride) sample_dim = kernel_size[0] * kernel_size[1] * in_channels if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, sample_dim)[1] else: self.norm = None self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) def forward(self, x, input_size): """ Args: x (Tensor): Has shape (B, H*W, C_in). input_size (tuple[int]): The spatial shape of x, arrange as (H, W). Default: None. Returns: tuple: Contains merged results and its spatial shape. - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) - out_size (tuple[int]): Spatial shape of x, arrange as (Merged_H, Merged_W). 
@MODELS.register_module()
class MultiheadAttention(BaseModule):
    """A wrapper for ``torch.nn.MultiheadAttention``.

    This module implements MultiheadAttention with identity connection,
    and positional encoding  is also passed as input.

    Args:
        embed_dims (int): The embedding dimension.
        num_heads (int): Parallel attention heads.
        attn_drop (float): A Dropout layer on attn_output_weights.
            Default: 0.0.
        proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
            Default: 0.0.
        dropout_layer (obj:`ConfigDict`): The dropout_layer used
            when adding the shortcut.
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): When it is True,  Key, Query and Value are shape of
            (batch, n, embed_dim), otherwise (n, batch, embed_dim).
             Default to False.
        **kwargs: Forwarded to ``torch.nn.MultiheadAttention``. The
            deprecated ``dropout`` key, if present, is consumed here and used
            for both ``attn_drop`` and the ``drop_prob`` of ``dropout_layer``.
    """

    def __init__(self,
                 embed_dims,
                 num_heads,
                 attn_drop=0.,
                 proj_drop=0.,
                 dropout_layer=dict(type='Dropout', drop_prob=0.),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):
        super().__init__(init_cfg)
        if 'dropout' in kwargs:
            warnings.warn(
                'The arguments `dropout` in MultiheadAttention '
                'has been deprecated, now you can separately '
                'set `attn_drop`(float), proj_drop(float), '
                'and `dropout_layer`(dict) ', DeprecationWarning)
            attn_drop = kwargs['dropout']
            # Write into a private copy: `dropout_layer` may be the shared
            # default dict (or a dict owned by the caller), and mutating it
            # would leak this `drop_prob` into later instances.
            dropout_layer = copy.deepcopy(dropout_layer)
            dropout_layer['drop_prob'] = kwargs.pop('dropout')

        self.embed_dims = embed_dims
        self.num_heads = num_heads
        self.batch_first = batch_first

        self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
                                          **kwargs)

        self.proj_drop = nn.Dropout(proj_drop)
        self.dropout_layer = build_dropout(
            dropout_layer) if dropout_layer else nn.Identity()

    @deprecated_api_warning({'residual': 'identity'},
                            cls_name='MultiheadAttention')
    def forward(self,
                query,
                key=None,
                value=None,
                identity=None,
                query_pos=None,
                key_pos=None,
                attn_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `MultiheadAttention`.

        **kwargs allow passing a more general data flow when combining
        with other operations in `transformerlayer`.

        Args:
            query (Tensor): The input query with shape [num_queries, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_queries embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims] .
                If None, the ``query`` will be used. Defaults to None.
            value (Tensor): The value tensor with same shape as `key`.
                Same in `nn.MultiheadAttention.forward`. Defaults to None.
                If None, the `key` will be used.
            identity (Tensor): This tensor, with the same shape as `query`,
                will be used for the identity link.
                If None, `query` will be used. Defaults to None.
            query_pos (Tensor): The positional encoding for query, with
                the same shape as `query`. If not None, it will
                be added to `query` before forward function. Defaults to None.
            key_pos (Tensor): The positional encoding for `key`, with the
                same shape as `key`. Defaults to None. If not None, it will
                be added to `key` before forward function. If None, and
                `query_pos` has the same shape as `key`, then `query_pos`
                will be used for `key_pos`. Defaults to None.
            attn_mask (Tensor): ByteTensor mask with shape [num_queries,
                num_keys]. Same in `nn.MultiheadAttention.forward`.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
                Defaults to None.

        Returns:
            Tensor: forwarded results with shape
            [num_queries, bs, embed_dims]
            if self.batch_first is False, else
            [bs, num_queries embed_dims].
        """

        # Defaults are resolved before any positional encoding is added, so
        # `value` and `identity` refer to the un-encoded tensors.
        if key is None:
            key = query
        if value is None:
            value = key
        if identity is None:
            identity = query
        if key_pos is None:
            if query_pos is not None:
                # use query_pos if key_pos is not available
                if query_pos.shape == key.shape:
                    key_pos = query_pos
                else:
                    warnings.warn('position encoding of key is '
                                  f'missing in {self.__class__.__name__}.')
        if query_pos is not None:
            query = query + query_pos
        if key_pos is not None:
            key = key + key_pos

        # Because the dataflow('key', 'query', 'value') of
        # ``torch.nn.MultiheadAttention`` is (num_query, batch,
        # embed_dims), We should adjust the shape of dataflow from
        # batch_first (batch, num_query, embed_dims) to num_query_first
        # (num_query ,batch, embed_dims), and recover ``attn_output``
        # from num_query_first to batch_first.
        if self.batch_first:
            query = query.transpose(0, 1)
            key = key.transpose(0, 1)
            value = value.transpose(0, 1)

        # `self.attn` returns a tuple; only the attention output is used.
        out = self.attn(
            query=query,
            key=key,
            value=value,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask)[0]

        if self.batch_first:
            out = out.transpose(0, 1)

        return identity + self.dropout_layer(self.proj_drop(out))
Defaults: 1024. num_fcs (int, optional): The number of fully-connected layers in FFNs. Default: 2. act_cfg (dict, optional): The activation config for FFNs. Default: dict(type='ReLU') ffn_drop (float, optional): Probability of an element to be zeroed in FFN. Default 0.0. add_identity (bool, optional): Whether to add the identity connection. Default: `True`. dropout_layer (obj:`ConfigDict`): The dropout_layer used when adding the shortcut. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. layer_scale_init_value (float): Initial value of scale factor in LayerScale. Default: 1.0 """ @deprecated_api_warning( { 'dropout': 'ffn_drop', 'add_residual': 'add_identity' }, cls_name='FFN') def __init__(self, embed_dims=256, feedforward_channels=1024, num_fcs=2, act_cfg=dict(type='ReLU', inplace=True), ffn_drop=0., dropout_layer=None, add_identity=True, init_cfg=None, layer_scale_init_value=0.): super().__init__(init_cfg) assert num_fcs >= 2, 'num_fcs should be no less ' \ f'than 2. got {num_fcs}.' self.embed_dims = embed_dims self.feedforward_channels = feedforward_channels self.num_fcs = num_fcs layers = [] in_channels = embed_dims for _ in range(num_fcs - 1): layers.append( Sequential( Linear(in_channels, feedforward_channels), build_activation_layer(act_cfg), nn.Dropout(ffn_drop))) in_channels = feedforward_channels layers.append(Linear(feedforward_channels, embed_dims)) layers.append(nn.Dropout(ffn_drop)) self.layers = Sequential(*layers) self.dropout_layer = build_dropout( dropout_layer) if dropout_layer else torch.nn.Identity() self.add_identity = add_identity if layer_scale_init_value > 0: self.gamma2 = LayerScale(embed_dims, scale=layer_scale_init_value) else: self.gamma2 = nn.Identity() @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') def forward(self, x, identity=None): """Forward function for `FFN`. The function would add x to the output tensor if residue is None. 
@MODELS.register_module()
class BaseTransformerLayer(BaseModule):
    """Base `TransformerLayer` for vision transformer.

    It can be built from `mmcv.ConfigDict` and support more flexible
    customization, for example, using any number of `FFN or LN ` and
    use different kinds of `attention` by specifying a list of `ConfigDict`
    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
    when you specifying `norm` as the first element of `operation_order`.
    More details about the `prenorm`: `On Layer Normalization in the
    Transformer Architecture`.

    Args:
        attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for `self_attention` or `cross_attention` modules,
            The order of the configs in the list should be consistent with
            corresponding attentions in operation_order.
            If it is a dict, all of the attention modules in operation_order
            will be built with this config. Default: None.
        ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
            Configs for FFN, The order of the configs in the list should be
            consistent with corresponding ffn in operation_order.
            If it is a dict, all of the FFN modules in operation_order
            will be built with this config.
        operation_order (tuple[str]): The execution order of operation
            in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
            Support `prenorm` when you specifying first element as `norm`.
            Default:None.
        norm_cfg (dict): Config dict for normalization layer.
            Default: dict(type='LN').
        init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
            Default: None.
        batch_first (bool): Key, Query and Value are shape
            of (batch, n, embed_dim)
            or (n, batch, embed_dim). Default to False.
    """

    def __init__(self,
                 attn_cfgs=None,
                 ffn_cfgs=dict(
                     type='FFN',
                     embed_dims=256,
                     feedforward_channels=1024,
                     num_fcs=2,
                     ffn_drop=0.,
                     act_cfg=dict(type='ReLU', inplace=True),
                 ),
                 operation_order=None,
                 norm_cfg=dict(type='LN'),
                 init_cfg=None,
                 batch_first=False,
                 **kwargs):

        deprecated_args = dict(
            feedforward_channels='feedforward_channels',
            ffn_dropout='ffn_drop',
            ffn_num_fcs='num_fcs')
        # The deprecated kwargs are written into `ffn_cfgs`. Do that on a
        # private copy: `ffn_cfgs` may be the shared default dict (or a dict
        # owned by the caller), and mutating it would leak these values into
        # layers constructed later.
        if any(ori_name in kwargs for ori_name in deprecated_args):
            ffn_cfgs = copy.deepcopy(ffn_cfgs)
        for ori_name, new_name in deprecated_args.items():
            if ori_name in kwargs:
                warnings.warn(
                    f'The arguments `{ori_name}` in BaseTransformerLayer '
                    f'has been deprecated, now you should set `{new_name}` '
                    f'and other FFN related arguments '
                    f'to a dict named `ffn_cfgs`. ', DeprecationWarning)
                ffn_cfgs[new_name] = kwargs[ori_name]

        super().__init__(init_cfg)

        self.batch_first = batch_first

        # Every entry of `operation_order` must be one of the known
        # operation names; not all of them have to appear.
        valid_operations = ['self_attn', 'norm', 'ffn', 'cross_attn']
        assert set(operation_order).issubset(valid_operations), \
            f'The operation_order of {self.__class__.__name__} should ' \
            f'only contain operation types from {valid_operations}'

        num_attn = operation_order.count('self_attn') + operation_order.count(
            'cross_attn')
        if isinstance(attn_cfgs, dict):
            attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
        else:
            assert num_attn == len(attn_cfgs), \
                f'The length of attn_cfgs {len(attn_cfgs)} is not ' \
                f'consistent with the number of attention {num_attn} ' \
                f'in operation_order {operation_order}.'

        self.num_attn = num_attn
        self.operation_order = operation_order
        self.norm_cfg = norm_cfg
        self.pre_norm = operation_order[0] == 'norm'
        self.attentions = ModuleList()

        index = 0
        for operation_name in operation_order:
            if operation_name in ['self_attn', 'cross_attn']:
                # An attention config may omit `batch_first`, but it may not
                # disagree with the layer-level setting.
                if 'batch_first' in attn_cfgs[index]:
                    assert self.batch_first == attn_cfgs[index]['batch_first']
                else:
                    attn_cfgs[index]['batch_first'] = self.batch_first
                attention = build_attention(attn_cfgs[index])
                # Some custom attentions used as `self_attn`
                # or `cross_attn` can have different behavior.
                attention.operation_name = operation_name
                self.attentions.append(attention)
                index += 1

        self.embed_dims = self.attentions[0].embed_dims

        self.ffns = ModuleList()
        num_ffns = operation_order.count('ffn')
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = ConfigDict(ffn_cfgs)
        if isinstance(ffn_cfgs, dict):
            ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
        assert len(ffn_cfgs) == num_ffns
        for ffn_index in range(num_ffns):
            # FFNs inherit the attention width unless they state the same
            # width explicitly.
            if 'embed_dims' not in ffn_cfgs[ffn_index]:
                ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
            else:
                assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
            self.ffns.append(
                build_feedforward_network(ffn_cfgs[ffn_index],
                                          dict(type='FFN')))

        self.norms = ModuleList()
        num_norms = operation_order.count('norm')
        for _ in range(num_norms):
            self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])

    def forward(self,
                query,
                key=None,
                value=None,
                query_pos=None,
                key_pos=None,
                attn_masks=None,
                query_key_padding_mask=None,
                key_padding_mask=None,
                **kwargs):
        """Forward function for `BaseTransformerLayer`.

        **kwargs contains some specific arguments of attentions.

        Args:
            query (Tensor): The input query with shape
                [num_queries, bs, embed_dims] if
                self.batch_first is False, else
                [bs, num_queries embed_dims].
            key (Tensor): The key tensor with shape [num_keys, bs,
                embed_dims] if self.batch_first is False, else
                [bs, num_keys, embed_dims] .
            value (Tensor): The value tensor with same shape as `key`.
            query_pos (Tensor): The positional encoding for `query`.
                Default: None.
            key_pos (Tensor): The positional encoding for `key`.
                Default: None.
            attn_masks (List[Tensor] | None): 2D Tensor used in
                calculation of corresponding attention. The length of
                it should equal to the number of `attention` in
                `operation_order`. A single Tensor is reused for every
                attention. Default: None.
            query_key_padding_mask (Tensor): ByteTensor for `query`, with
                shape [bs, num_queries]. Only used in `self_attn` layer.
                Defaults to None.
            key_padding_mask (Tensor): ByteTensor for `key`, with
                shape [bs, num_keys]. Only used in `cross_attn` layer.
                Default: None.

        Returns:
            Tensor: forwarded results with shape [num_queries, bs, embed_dims].
        """

        norm_index = 0
        attn_index = 0
        ffn_index = 0
        identity = query
        if attn_masks is None:
            attn_masks = [None for _ in range(self.num_attn)]
        elif isinstance(attn_masks, torch.Tensor):
            attn_masks = [
                copy.deepcopy(attn_masks) for _ in range(self.num_attn)
            ]
            warnings.warn(f'Use same attn_mask in all attentions in '
                          f'{self.__class__.__name__} ')
        else:
            assert len(attn_masks) == self.num_attn, f'The length of ' \
                        f'attn_masks {len(attn_masks)} must be equal ' \
                        f'to the number of attention in ' \
                        f'operation_order {self.num_attn}'

        # In pre-norm mode the shortcut is taken from `identity` (the value
        # before the preceding norm); otherwise each sub-module uses its own
        # input as the shortcut.
        for layer in self.operation_order:
            if layer == 'self_attn':
                temp_key = temp_value = query
                query = self.attentions[attn_index](
                    query,
                    temp_key,
                    temp_value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=query_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=query_key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'norm':
                query = self.norms[norm_index](query)
                norm_index += 1

            elif layer == 'cross_attn':
                query = self.attentions[attn_index](
                    query,
                    key,
                    value,
                    identity if self.pre_norm else None,
                    query_pos=query_pos,
                    key_pos=key_pos,
                    attn_mask=attn_masks[attn_index],
                    key_padding_mask=key_padding_mask,
                    **kwargs)
                attn_index += 1
                identity = query

            elif layer == 'ffn':
                query = self.ffns[ffn_index](
                    query, identity if self.pre_norm else None)
                ffn_index += 1

        return query
def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
    """Build `num_layers` transformer layers from their configs.

    A single dict config is deep-copied once per layer; a list must
    already hold exactly `num_layers` configs.
    """
    super().__init__(init_cfg)
    if not isinstance(transformerlayers, dict):
        assert isinstance(transformerlayers, list) and \
            len(transformerlayers) == num_layers
        layer_cfgs = transformerlayers
    else:
        layer_cfgs = [
            copy.deepcopy(transformerlayers) for _ in range(num_layers)
        ]
    self.num_layers = num_layers
    self.layers = ModuleList()
    for layer_idx in range(num_layers):
        built_layer = build_transformer_layer(layer_cfgs[layer_idx])
        self.layers.append(built_layer)
    # The first layer is taken as representative of the whole stack.
    first_layer = self.layers[0]
    self.embed_dims = first_layer.embed_dims
    self.pre_norm = first_layer.pre_norm
""" for layer in self.layers: query = layer( query, key, value, query_pos=query_pos, key_pos=key_pos, attn_masks=attn_masks, query_key_padding_mask=query_key_padding_mask, key_padding_mask=key_padding_mask, **kwargs) return query ================================================ FILE: mmcv/cnn/bricks/upsample.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import inspect from typing import Dict import torch import torch.nn as nn import torch.nn.functional as F from mmengine.model import xavier_init from mmengine.registry import MODELS MODELS.register_module('nearest', module=nn.Upsample) MODELS.register_module('bilinear', module=nn.Upsample) @MODELS.register_module(name='pixel_shuffle') class PixelShufflePack(nn.Module): """Pixel Shuffle upsample layer. This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to achieve a simple upsampling with pixel shuffle. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. scale_factor (int): Upsample ratio. upsample_kernel (int): Kernel size of the conv layer to expand the channels. """ def __init__(self, in_channels: int, out_channels: int, scale_factor: int, upsample_kernel: int): super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.scale_factor = scale_factor self.upsample_kernel = upsample_kernel self.upsample_conv = nn.Conv2d( self.in_channels, self.out_channels * scale_factor * scale_factor, self.upsample_kernel, padding=(self.upsample_kernel - 1) // 2) self.init_weights() def init_weights(self): xavier_init(self.upsample_conv, distribution='uniform') def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.upsample_conv(x) x = F.pixel_shuffle(x, self.scale_factor) return x def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: """Build upsample layer. Args: cfg (dict): The upsample layer config, which should contain: - type (str): Layer type. 
- scale_factor (int): Upsample ratio, which is not applicable to deconv. - layer args: Args needed to instantiate a upsample layer. args (argument list): Arguments passed to the ``__init__`` method of the corresponding conv layer. kwargs (keyword arguments): Keyword arguments passed to the ``__init__`` method of the corresponding conv layer. Returns: nn.Module: Created upsample layer. """ if not isinstance(cfg, dict): raise TypeError(f'cfg must be a dict, but got {type(cfg)}') if 'type' not in cfg: raise KeyError( f'the cfg dict must contain the key "type", but got {cfg}') cfg_ = cfg.copy() layer_type = cfg_.pop('type') if inspect.isclass(layer_type): upsample = layer_type # Switch registry to the target scope. If `upsample` cannot be found # in the registry, fallback to search `upsample` in the # mmengine.MODELS. else: with MODELS.switch_scope_and_registry(None) as registry: upsample = registry.get(layer_type) if upsample is None: raise KeyError(f'Cannot find {upsample} in registry under scope ' f'name {registry.scope}') if upsample is nn.Upsample: cfg_['mode'] = layer_type layer = upsample(*args, **kwargs, **cfg_) return layer ================================================ FILE: mmcv/cnn/bricks/wrappers.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 Wrap some nn modules to support empty tensor input. Currently, these wrappers are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask heads are trained on only positive RoIs. 
""" import math import torch import torch.nn as nn from mmengine.registry import MODELS from torch.nn.modules.utils import _pair, _triple if torch.__version__ == 'parrots': TORCH_VERSION = torch.__version__ else: # torch.__version__ could be 1.3.1+cu92, we only need the first two # for comparison TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) def obsolete_torch_version(torch_version, version_threshold) -> bool: return torch_version == 'parrots' or torch_version <= version_threshold class NewEmptyTensorOp(torch.autograd.Function): @staticmethod def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod def backward(ctx, grad: torch.Tensor) -> tuple: shape = ctx.shape return NewEmptyTensorOp.apply(grad, shape), None @MODELS.register_module('Conv', force=True) class Conv2d(nn.Conv2d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, self.padding, self.stride, self.dilation): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) @MODELS.register_module('Conv3d', force=True) class Conv3d(nn.Conv3d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, self.padding, self.stride, self.dilation): o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. 
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) @MODELS.register_module() @MODELS.register_module('deconv') class ConvTranspose2d(nn.ConvTranspose2d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, self.padding, self.stride, self.dilation, self.output_padding): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) @MODELS.register_module() @MODELS.register_module('deconv3d') class ConvTranspose3d(nn.ConvTranspose3d): def forward(self, x: torch.Tensor) -> torch.Tensor: if obsolete_torch_version(TORCH_VERSION, (1, 4)) and x.numel() == 0: out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, self.padding, self.stride, self.dilation, self.output_padding): out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. 
dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) class MaxPool2d(nn.MaxPool2d): def forward(self, x: torch.Tensor) -> torch.Tensor: # PyTorch 1.9 does not support empty tensor inference yet if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: out_shape = list(x.shape[:2]) for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), _pair(self.padding), _pair(self.stride), _pair(self.dilation)): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) return empty return super().forward(x) class MaxPool3d(nn.MaxPool3d): def forward(self, x: torch.Tensor) -> torch.Tensor: # PyTorch 1.9 does not support empty tensor inference yet if obsolete_torch_version(TORCH_VERSION, (1, 9)) and x.numel() == 0: out_shape = list(x.shape[:2]) for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), _triple(self.padding), _triple(self.stride), _triple(self.dilation)): o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 o = math.ceil(o) if self.ceil_mode else math.floor(o) out_shape.append(o) empty = NewEmptyTensorOp.apply(x, out_shape) return empty return super().forward(x) class Linear(torch.nn.Linear): def forward(self, x: torch.Tensor) -> torch.Tensor: # empty tensor forward of Linear layer is supported in Pytorch 1.6 if obsolete_torch_version(TORCH_VERSION, (1, 5)) and x.numel() == 0: out_shape = [x.shape[0], self.out_features] empty = NewEmptyTensorOp.apply(x, out_shape) if self.training: # produce dummy gradient to avoid DDP warning. dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 return empty + dummy else: return empty return super().forward(x) ================================================ FILE: mmcv/cnn/resnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
def conv3x3(in_planes: int,
            out_planes: int,
            stride: int = 1,
            dilation: int = 1):
    """3x3 convolution with padding."""
    # padding == dilation keeps the spatial size unchanged for stride 1.
    return nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        bias=False)


class BasicBlock(nn.Module):
    """Two-conv residual block used by ResNet-18/34.

    ``style`` is validated but does not alter the block, and gradient
    checkpointing (``with_cp``) is not supported here.
    """
    expansion = 1

    def __init__(self,
                 inplanes: int,
                 planes: int,
                 stride: int = 1,
                 dilation: int = 1,
                 downsample: Optional[nn.Module] = None,
                 style: str = 'pytorch',
                 with_cp: bool = False):
        super().__init__()
        assert style in ['pytorch', 'caffe']
        self.conv1 = conv3x3(inplanes, planes, stride, dilation)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        assert not with_cp

    def forward(self, x: Tensor) -> Tensor:
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 residual block used by ResNet-50/101/152."""
    expansion = 4

    def __init__(self,
                 inplanes: int,
                 planes: int,
                 stride: int = 1,
                 dilation: int = 1,
                 downsample: Optional[nn.Module] = None,
                 style: str = 'pytorch',
                 with_cp: bool = False):
        """Bottleneck block.

        If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
        it is "caffe", the stride-two layer is the first 1x1 conv layer.
        """
        super().__init__()
        assert style in ['pytorch', 'caffe']
        if style == 'pytorch':
            conv1_stride = 1
            conv2_stride = stride
        else:
            conv1_stride = stride
            conv2_stride = 1
        self.conv1 = nn.Conv2d(
            inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
        self.conv2 = nn.Conv2d(
            planes,
            planes,
            kernel_size=3,
            stride=conv2_stride,
            padding=dilation,
            dilation=dilation,
            bias=False)

        self.bn1 = nn.BatchNorm2d(planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(
            planes, planes * self.expansion, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.dilation = dilation
        self.with_cp = with_cp

    def forward(self, x: Tensor) -> Tensor:

        def _inner_forward(x):
            residual = x

            out = self.conv1(x)
            out = self.bn1(out)
            out = self.relu(out)

            out = self.conv2(out)
            out = self.bn2(out)
            out = self.relu(out)

            out = self.conv3(out)
            out = self.bn3(out)

            if self.downsample is not None:
                residual = self.downsample(x)

            out += residual

            return out

        # Checkpointing is only used when the input requires grad.
        if self.with_cp and x.requires_grad:
            out = cp.checkpoint(_inner_forward, x)
        else:
            out = _inner_forward(x)

        out = self.relu(out)

        return out


def make_res_layer(block: nn.Module,
                   inplanes: int,
                   planes: int,
                   blocks: int,
                   stride: int = 1,
                   dilation: int = 1,
                   style: str = 'pytorch',
                   with_cp: bool = False) -> nn.Module:
    """Stack ``blocks`` residual blocks into one ResNet stage.

    Only the first block applies ``stride`` and, when the shape changes,
    a 1x1 conv + BN projection on the shortcut.
    """
    downsample = None
    if stride != 1 or inplanes != planes * block.expansion:
        downsample = nn.Sequential(
            nn.Conv2d(
                inplanes,
                planes * block.expansion,
                kernel_size=1,
                stride=stride,
                bias=False),
            nn.BatchNorm2d(planes * block.expansion),
        )

    layers = []
    layers.append(
        block(
            inplanes,
            planes,
            stride,
            dilation,
            downsample,
            style=style,
            with_cp=with_cp))
    inplanes = planes * block.expansion
    for _ in range(1, blocks):
        layers.append(
            block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))

    return nn.Sequential(*layers)


class ResNet(nn.Module):
    """ResNet backbone.

    Args:
        depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
        num_stages (int): Resnet stages, normally 4.
        strides (Sequence[int]): Strides of the first block of each stage.
        dilations (Sequence[int]): Dilation of each stage.
        out_indices (Sequence[int]): Output from which stages.
        style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
            layer is the 3x3 conv layer, otherwise the stride-two layer is
            the first 1x1 conv layer.
        frozen_stages (int): Stages to be frozen (all param fixed). -1 means
            not freezing any parameters.
        bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
            running stats (mean and var).
        bn_frozen (bool): Whether to freeze weight and bias of BN layers.
        with_cp (bool): Use checkpoint or not. Using checkpoint will save some
            memory while slowing down the training speed.
    """

    arch_settings = {
        18: (BasicBlock, (2, 2, 2, 2)),
        34: (BasicBlock, (3, 4, 6, 3)),
        50: (Bottleneck, (3, 4, 6, 3)),
        101: (Bottleneck, (3, 4, 23, 3)),
        152: (Bottleneck, (3, 8, 36, 3))
    }

    def __init__(self,
                 depth: int,
                 num_stages: int = 4,
                 strides: Sequence[int] = (1, 2, 2, 2),
                 dilations: Sequence[int] = (1, 1, 1, 1),
                 out_indices: Sequence[int] = (0, 1, 2, 3),
                 style: str = 'pytorch',
                 frozen_stages: int = -1,
                 bn_eval: bool = True,
                 bn_frozen: bool = False,
                 with_cp: bool = False):
        super().__init__()
        if depth not in self.arch_settings:
            raise KeyError(f'invalid depth {depth} for resnet')
        assert num_stages >= 1 and num_stages <= 4
        block, stage_blocks = self.arch_settings[depth]
        stage_blocks = stage_blocks[:num_stages]  # type: ignore
        assert len(strides) == len(dilations) == num_stages
        assert max(out_indices) < num_stages

        self.out_indices = out_indices
        self.style = style
        self.frozen_stages = frozen_stages
        self.bn_eval = bn_eval
        self.bn_frozen = bn_frozen
        self.with_cp = with_cp

        self.inplanes: int = 64
        self.conv1 = nn.Conv2d(
            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # Stages are registered as `layer1`..`layerN`; only their names are
        # kept here so they can be fetched with getattr in order.
        self.res_layers = []
        for i, num_blocks in enumerate(stage_blocks):
            stride = strides[i]
            dilation = dilations[i]
            planes = 64 * 2**i
            res_layer = make_res_layer(
                block,
                self.inplanes,
                planes,
                num_blocks,
                stride=stride,
                dilation=dilation,
                style=self.style,
                with_cp=with_cp)
            self.inplanes = planes * block.expansion  # type: ignore
            layer_name = f'layer{i + 1}'
            self.add_module(layer_name, res_layer)
            self.res_layers.append(layer_name)

        self.feat_dim = block.expansion * 64 * 2**(  # type: ignore
            len(stage_blocks) - 1)

    def init_weights(self, pretrained: Optional[str] = None) -> None:
        """Load a checkpoint, or initialize conv and BN layers.

        Args:
            pretrained (str, optional): Checkpoint path. If None, conv
                layers get Kaiming init and BN layers constant init.

        Raises:
            TypeError: If ``pretrained`` is neither a str nor None.
        """
        if isinstance(pretrained, str):
            logger = logging.getLogger()
            load_checkpoint(self, pretrained, strict=False, logger=logger)
        elif pretrained is None:
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    kaiming_init(m)
                elif isinstance(m, nn.BatchNorm2d):
                    constant_init(m, 1)
        else:
            raise TypeError('pretrained must be a str or None')

    def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:
        """Return the selected stage outputs.

        A single Tensor is returned when exactly one stage is selected,
        otherwise a tuple ordered by stage.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        outs = []
        for i, layer_name in enumerate(self.res_layers):
            res_layer = getattr(self, layer_name)
            x = res_layer(x)
            if i in self.out_indices:
                outs.append(x)
        if len(outs) == 1:
            return outs[0]
        else:
            return tuple(outs)

    def train(self, mode: bool = True) -> 'ResNet':
        """Set the training mode while honoring the freezing options.

        Args:
            mode (bool): True for training mode, False for evaluation mode.

        Returns:
            ResNet: ``self``, matching ``nn.Module.train`` so that
            ``model.train()`` and ``model.eval()`` can be chained.
        """
        super().train(mode)
        if self.bn_eval:
            # Keep BN running statistics fixed regardless of `mode`.
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    if self.bn_frozen:
                        for params in m.parameters():
                            params.requires_grad = False
        if mode and self.frozen_stages >= 0:
            # Freeze the stem, then stages 1..frozen_stages.
            for param in self.conv1.parameters():
                param.requires_grad = False
            for param in self.bn1.parameters():
                param.requires_grad = False
            self.bn1.eval()
            self.bn1.weight.requires_grad = False
            self.bn1.bias.requires_grad = False
            for i in range(1, self.frozen_stages + 1):
                mod = getattr(self, f'layer{i}')
                mod.eval()
                for param in mod.parameters():
                    param.requires_grad = False
        return self
class BaseConvRFSearchOp(BaseModule):
    """Base class of the receptive-field searchable conv operators.

    Args:
        op_layer (nn.Module): pytorch module, e,g, Conv2d
        global_config (dict): config dict.
    """

    def __init__(self, op_layer: nn.Module, global_config: dict):
        super().__init__()
        self.op_layer = op_layer
        self.global_config = global_config

    def normlize(self, weights: nn.Parameter) -> nn.Parameter:
        """Scale the absolute weights so that they sum to one.

        Args:
            weights (nn.Parameter): Weights to be normalized.

        Returns:
            nn.Parameters: Normalized weights.
        """
        magnitudes = torch.abs(weights)
        return magnitudes / torch.sum(magnitudes)


class Conv2dRFSearchOp(BaseConvRFSearchOp):
    """Enable Conv2d with receptive field searching ability.

    Args:
        op_layer (nn.Module): pytorch module, e,g, Conv2d
        global_config (dict): config dict. Defaults to None.
            By default this must include:

            - "init_alphas": The value for initializing weights of each branch.
            - "num_branches": The controller of the size of
              search space (the number of branches).
            - "exp_rate": The controller of the sparsity of search space.
            - "mmin": The minimum dilation rate.
            - "mmax": The maximum dilation rate.

            Extra keys may exist, but are used by RFSearchHook, e.g., "step",
            "max_step", "search_interval", and "skip_layer".
        verbose (bool): Determines whether to print rf-next
            related logging messages.
            Defaults to True.
    """

    def __init__(self,
                 op_layer: nn.Module,
                 global_config: dict,
                 verbose: bool = True):
        super().__init__(op_layer, global_config)
        assert global_config is not None, 'global_config is None'
        self.num_branches = global_config['num_branches']
        assert self.num_branches in [2, 3]
        self.verbose = verbose
        self.dilation_rates = self._candidate_rates(op_layer.dilation)
        # One learnable mixing weight per branch, set to `init_alphas`.
        self.branch_weights = nn.Parameter(torch.Tensor(self.num_branches))
        if self.verbose:
            print_log(f'Expand as {self.dilation_rates}', 'current')
        nn.init.constant_(self.branch_weights, global_config['init_alphas'])

    def _candidate_rates(self, dilation) -> list:
        """Expand ``dilation`` into the candidate rates of every branch.

        An axis whose kernel size is 1 or even is not searched: it keeps
        the current dilation in every candidate.
        """
        rates = expand_rates(dilation, self.global_config)
        kernel_h = self.op_layer.kernel_size[0]
        if kernel_h == 1 or kernel_h % 2 == 0:
            rates = [(dilation[0], rate[1]) for rate in rates]
        kernel_w = self.op_layer.kernel_size[1]
        if kernel_w == 1 or kernel_w % 2 == 0:
            rates = [(rate[0], dilation[1]) for rate in rates]
        return rates

    def forward(self, input: Tensor) -> Tensor:
        """Sum the weighted outputs of all dilation branches.

        With a single dilation rate the convolution is applied unweighted.
        """
        rates = self.dilation_rates
        norm_w = self.normlize(self.branch_weights[:len(rates)])

        def _branch(rate):
            # Same weights for every branch; only dilation/padding differ.
            return nn.functional.conv2d(
                input,
                weight=self.op_layer.weight,
                bias=self.op_layer.bias,
                stride=self.op_layer.stride,
                padding=self.get_padding(rate),
                dilation=rate,
                groups=self.op_layer.groups,
            )

        if len(rates) == 1:
            outputs = [_branch(rates[0])]
        else:
            outputs = [
                _branch(rate) * norm_w[idx] for idx, rate in enumerate(rates)
            ]
        output = outputs[0]
        for extra in outputs[1:]:
            output += extra
        return output

    def estimate_rates(self) -> None:
        """Estimate new dilation rate based on trained branch_weights."""
        rates = self.dilation_rates
        norm_w = self.normlize(self.branch_weights[:len(rates)])
        if self.verbose:
            print_log(
                'Estimate dilation {} with weight {}.'.format(
                    rates,
                    norm_w.detach().cpu().numpy().tolist()), 'current')

        # Weighted mean of the candidate rates along each axis.
        sum0, sum1, w_sum = 0, 0, 0
        for idx, rate in enumerate(rates):
            weight = norm_w[idx].item()
            sum0 += weight * rate[0]
            sum1 += weight * rate[1]
            w_sum += weight
        lower = self.global_config['mmin']
        upper = self.global_config['mmax']
        estimated = [
            np.clip(int(round(sum0 / w_sum)), lower, upper).item(),
            np.clip(int(round(sum1 / w_sum)), lower, upper).item()
        ]
        self.op_layer.dilation = tuple(estimated)
        self.op_layer.padding = self.get_padding(self.op_layer.dilation)
        self.dilation_rates = [tuple(estimated)]
        if self.verbose:
            print_log(f'Estimate as {tuple(estimated)}', 'current')

    def expand_rates(self) -> None:
        """Expand dilation rate."""
        new_rates = self._candidate_rates(self.op_layer.dilation)
        self.dilation_rates = copy.deepcopy(new_rates)
        if self.verbose:
            print_log(f'Expand as {self.dilation_rates}', 'current')
        # Restart every branch from the same initial weight.
        nn.init.constant_(self.branch_weights,
                          self.global_config['init_alphas'])

    def get_padding(self, dilation) -> tuple:
        """Return the (height, width) padding for ``dilation``."""
        kernel = self.op_layer.kernel_size
        stride = self.op_layer.stride
        pad_h = get_single_padding(kernel[0], stride[0], dilation[0])
        pad_w = get_single_padding(kernel[1], stride[1], dilation[1])
        return (pad_h, pad_w)
@HOOKS.register_module()
class RFSearchHook(Hook):
    """Receptive field search via dilation rates.

    Please refer to `RF-Next: Efficient Receptive Field
    Search for Convolutional Neural Networks` for more details.

    Args:
        mode (str, optional): It can be set to the following types:
            'search', 'fixed_single_branch', or 'fixed_multi_branch'.
            Defaults to 'search'.
        config (Dict, optional): config dict of search.
            By default this config contains "search",
            and config["search"] must include:

            - "step": recording the current searching step.
            - "max_step": The maximum number of searching steps
              to update the structures.
            - "search_interval": The interval (epoch/iteration)
              between two updates.
            - "exp_rate": The controller of the sparsity of search space.
            - "init_alphas": The value for initializing weights of each branch.
            - "mmin": The minimum dilation rate.
            - "mmax": The maximum dilation rate.
            - "num_branches": The controller of the size of
              search space (the number of branches).
            - "skip_layer": The modules in skip_layer will be ignored
              during the receptive field search.
        rfstructure_file (str, optional): Path to load searched receptive
            fields of the model. Defaults to None.
        by_epoch (bool, optional): Determine to perform step by epoch or
            by iteration. If set to True, it will step by epoch. Otherwise, by
            iteration. Defaults to True.
        verbose (bool): Determines whether to print rf-next related logging
            messages. Defaults to True.
    """

    def __init__(self,
                 mode: str = 'search',
                 config: Dict = {},
                 rfstructure_file: Optional[str] = None,
                 by_epoch: bool = True,
                 verbose: bool = True):
        assert mode in ['search', 'fixed_single_branch', 'fixed_multi_branch']
        assert config is not None
        # Shallow copy so that adding the 'structure' key does not write
        # into the shared default dict or the caller's top-level dict.
        # Nested dicts such as config['search'] are still shared.
        self.config = dict(config)
        self.config['structure'] = {}
        self.verbose = verbose
        if rfstructure_file is not None:
            rfstructure = mmengine.load(rfstructure_file)['structure']
            self.config['structure'] = rfstructure
        self.mode = mode
        self.num_branches = self.config['search']['num_branches']
        self.by_epoch = by_epoch

    def init_model(self, model: nn.Module):
        """Init model with search ability.

        Args:
            model (nn.Module): pytorch model

        Raises:
            NotImplementedError: only support three modes:
                search/fixed_single_branch/fixed_multi_branch
        """
        if self.verbose:
            print_log('RFSearch init begin.', 'current')
        if self.mode == 'search':
            # A previously searched structure, if any, is applied first.
            if self.config['structure']:
                self.set_model(model, search_op='Conv2d')
            self.wrap_model(model, search_op='Conv2d')
        elif self.mode == 'fixed_single_branch':
            self.set_model(model, search_op='Conv2d')
        elif self.mode == 'fixed_multi_branch':
            self.set_model(model, search_op='Conv2d')
            self.wrap_model(model, search_op='Conv2d')
        else:
            raise NotImplementedError
        if self.verbose:
            print_log('RFSearch init end.', 'current')

    def after_train_epoch(self, runner):
        """Performs a dilation searching step after one training epoch."""
        if self.by_epoch and self.mode == 'search':
            self.step(runner.model, runner.work_dir)

    def after_train_iter(self, runner, batch_idx, data_batch, outputs):
        """Performs a dilation searching step after one training iteration."""
        if not self.by_epoch and self.mode == 'search':
            self.step(runner.model, runner.work_dir)

    def step(self, model: nn.Module, work_dir: str) -> None:
        """Performs a dilation searching step.

        The step counter always advances. The structure is re-estimated
        and saved only on multiples of ``search_interval`` that are still
        below ``max_step``.

        Args:
            model (nn.Module): pytorch model
            work_dir (str): Directory to save the searching results.
        """
        search_cfg = self.config['search']
        search_cfg['step'] += 1
        if (search_cfg['step'] % search_cfg['search_interval'] == 0
                and search_cfg['step'] < search_cfg['max_step']):
            self.estimate_and_expand(model)
            for name, module in model.named_modules():
                if isinstance(module, BaseConvRFSearchOp):
                    self.config['structure'][name] = module.op_layer.dilation

            write_to_json(
                self.config,
                os.path.join(
                    work_dir,
                    'local_search_config_step%d.json' % search_cfg['step'],
                ),
            )

    def estimate_and_expand(self, model: nn.Module) -> None:
        """Estimate and search for RFConvOp.

        Args:
            model (nn.Module): pytorch model
        """
        for module in model.modules():
            if isinstance(module, BaseConvRFSearchOp):
                module.estimate_rates()
                module.expand_rates()

    @staticmethod
    def _fullname(prefix: str, name: str) -> str:
        """Return the dotted module path; the root prefix is 'module'."""
        if prefix == '':
            return 'module.' + name
        return prefix + '.' + name

    def _is_skipped(self, fullname: str) -> bool:
        """Whether ``fullname`` contains an entry of ``skip_layer``."""
        skip_layer = self.config['search']['skip_layer']
        if skip_layer is None:
            return False
        return any(layer in fullname for layer in skip_layer)

    @staticmethod
    def _has_searchable_kernel(module: nn.Module) -> bool:
        """True if at least one kernel axis is odd and larger than 1."""
        kernel_h, kernel_w = module.kernel_size[0], module.kernel_size[1]
        return (1 < kernel_h and 0 != kernel_h % 2) or \
            (1 < kernel_w and 0 != kernel_w % 2)

    def wrap_model(self,
                   model: nn.Module,
                   search_op: str = 'Conv2d',
                   prefix: str = '') -> None:
        """Wrap model to support searchable conv op.

        Args:
            model (nn.Module): pytorch model
            search_op (str): The module that uses RF search.
                Defaults to 'Conv2d'.
            prefix (str): Prefix for function recursion.
                Defaults to ''.
        """
        # `search_op` names a class in torch.nn; its wrapper is the class
        # named '<search_op>RFSearchOp' available in this module.
        target_cls = getattr(nn, search_op)
        for name, module in model.named_children():
            fullname = self._fullname(prefix, name)
            if self._is_skipped(fullname):
                continue
            if isinstance(module, target_cls):
                if self._has_searchable_kernel(module):
                    wrapper_cls = globals()[search_op + 'RFSearchOp']
                    moduleWrap = wrapper_cls(module, self.config['search'],
                                             self.verbose)
                    moduleWrap = moduleWrap.to(module.weight.device)
                    if self.verbose:
                        print_log(
                            'Wrap model %s to %s.' %
                            (str(module), str(moduleWrap)), 'current')
                    setattr(model, name, moduleWrap)
            elif not isinstance(module, BaseConvRFSearchOp):
                self.wrap_model(module, search_op, fullname)

    def set_model(self,
                  model: nn.Module,
                  search_op: str = 'Conv2d',
                  init_rates: Optional[int] = None,
                  prefix: str = '') -> None:
        """Set model based on config.

        Dilation and padding of every searchable conv are overwritten
        with the values stored in ``self.config['structure']``.

        Args:
            model (nn.Module): pytorch model
            search_op (str): The module that uses RF search.
                Defaults to 'Conv2d'.
            init_rates (int, optional): Not used by this method; it is only
                forwarded to the recursive calls. Defaults to None.
            prefix (str): Prefix for function recursion.
                Defaults to ''.
        """
        target_cls = getattr(nn, search_op)
        structure = self.config['structure']
        for name, module in model.named_children():
            fullname = self._fullname(prefix, name)
            if self._is_skipped(fullname):
                continue
            if isinstance(module, target_cls):
                if self._has_searchable_kernel(module):
                    # A scalar rate applies to both spatial axes.
                    if isinstance(structure[fullname], int):
                        structure[fullname] = [
                            structure[fullname], structure[fullname]
                        ]
                    module.dilation = (
                        structure[fullname][0],
                        structure[fullname][1],
                    )
                    module.padding = (
                        get_single_padding(module.kernel_size[0],
                                           module.stride[0],
                                           structure[fullname][0]),
                        get_single_padding(module.kernel_size[1],
                                           module.stride[1],
                                           structure[fullname][1]))
                    setattr(model, name, module)
                    if self.verbose:
                        print_log(
                            'Set module %s dilation as: [%d %d]' %
                            (fullname, module.dilation[0], module.dilation[1]),
                            'current')
            elif not isinstance(module, BaseConvRFSearchOp):
                self.set_model(module, search_op, init_rates, fullname)
All rights reserved. import mmengine import numpy as np def write_to_json(config: dict, filename: str): """Save config to json file. Args: config (dict): Config to be saved. filename (str): Path to save config. """ with open(filename, 'w', encoding='utf-8') as f: mmengine.dump(config, f, file_format='json') def expand_rates(dilation: tuple, config: dict) -> list: """Expand dilation rate according to config. Args: dilation (int): _description_ config (dict): config dict Returns: list: list of expanded dilation rates """ exp_rate = config['exp_rate'] large_rates = [] small_rates = [] for _ in range(config['num_branches'] // 2): large_rates.append( tuple([ np.clip( int(round((1 + exp_rate) * dilation[0])), config['mmin'], config['mmax']).item(), np.clip( int(round((1 + exp_rate) * dilation[1])), config['mmin'], config['mmax']).item() ])) small_rates.append( tuple([ np.clip( int(round((1 - exp_rate) * dilation[0])), config['mmin'], config['mmax']).item(), np.clip( int(round((1 - exp_rate) * dilation[1])), config['mmin'], config['mmax']).item() ])) small_rates.reverse() if config['num_branches'] % 2 == 0: rate_list = small_rates + large_rates else: rate_list = small_rates + [dilation] + large_rates unique_rate_list = list(set(rate_list)) unique_rate_list.sort(key=rate_list.index) return unique_rate_list def get_single_padding(kernel_size: int, stride: int = 1, dilation: int = 1) -> int: padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 return padding ================================================ FILE: mmcv/cnn/utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .flops_counter import get_model_complexity_info from .fuse_conv_bn import fuse_conv_bn __all__ = ['get_model_complexity_info', 'fuse_conv_bn'] ================================================ FILE: mmcv/cnn/utils/flops_counter.py ================================================ # Modified from flops-counter.pytorch by Vladislav Sovrasov # original repo: https://github.com/sovrasov/flops-counter.pytorch # MIT License # Copyright (c) 2018 Vladislav Sovrasov # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
import sys import warnings from functools import partial from typing import Any, Callable, Dict, Optional, TextIO, Tuple import numpy as np import torch import torch.nn as nn from mmcv.cnn.bricks import (Conv2d, Conv3d, ConvTranspose2d, Linear, MaxPool2d, MaxPool3d) def get_model_complexity_info(model: nn.Module, input_shape: tuple, print_per_layer_stat: bool = True, as_strings: bool = True, input_constructor: Optional[Callable] = None, flush: bool = False, ost: TextIO = sys.stdout) -> tuple: """Get complexity information of a model. This method can calculate FLOPs and parameter counts of a model with corresponding input shape. It can also print complexity information for each layer in a model. Supported layers are listed as below: - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, ``nn.ReLU6``. - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. - Linear: ``nn.Linear``. - Deconvolution: ``nn.ConvTranspose2d``. - Upsample: ``nn.Upsample``. Args: model (nn.Module): The model for complexity calculation. input_shape (tuple): Input shape used for calculation. print_per_layer_stat (bool): Whether to print complexity information for each layer in a model. Default: True. as_strings (bool): Output FLOPs and params counts in a string form. Default: True. input_constructor (None | callable): If specified, it takes a callable method that generates input. otherwise, it will generate a random tensor with input shape to calculate FLOPs. Default: None. flush (bool): same as that in :func:`print`. 
Default: False. ost (stream): same as ``file`` param in :func:`print`. Default: sys.stdout. Returns: tuple[float | str]: If ``as_strings`` is set to True, it will return FLOPs and parameter counts in a string format. otherwise, it will return those in a float number format. """ assert type(input_shape) is tuple assert len(input_shape) >= 1 assert isinstance(model, nn.Module) flops_model = add_flops_counting_methods(model) flops_model.eval() flops_model.start_flops_count() if input_constructor: input = input_constructor(input_shape) _ = flops_model(**input) else: try: batch = torch.ones(()).new_empty( (1, *input_shape), dtype=next(flops_model.parameters()).dtype, device=next(flops_model.parameters()).device) except StopIteration: # Avoid StopIteration for models which have no parameters, # like `nn.Relu()`, `nn.AvgPool2d`, etc. batch = torch.ones(()).new_empty((1, *input_shape)) _ = flops_model(batch) flops_count, params_count = flops_model.compute_average_flops_cost() if print_per_layer_stat: print_model_with_flops( flops_model, flops_count, params_count, ost=ost, flush=flush) flops_model.stop_flops_count() if as_strings: return flops_to_string(flops_count), params_to_string(params_count) return flops_count, params_count def flops_to_string(flops: float, units: Optional[str] = 'GFLOPs', precision: int = 2) -> str: """Convert FLOPs number into a string. Note that Here we take a multiply-add counts as one FLOP. Args: flops (float): FLOPs number to be converted. units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically choose the most suitable unit for FLOPs. Default: 'GFLOPs'. precision (int): Digit number after the decimal point. Default: 2. Returns: str: The converted FLOPs number with units. 
Examples: >>> flops_to_string(1e9) '1.0 GFLOPs' >>> flops_to_string(2e5, 'MFLOPs') '0.2 MFLOPs' >>> flops_to_string(3e-9, None) '3e-09 FLOPs' """ if units is None: if flops // 10**9 > 0: return str(round(flops / 10.**9, precision)) + ' GFLOPs' elif flops // 10**6 > 0: return str(round(flops / 10.**6, precision)) + ' MFLOPs' elif flops // 10**3 > 0: return str(round(flops / 10.**3, precision)) + ' KFLOPs' else: return str(flops) + ' FLOPs' else: if units == 'GFLOPs': return str(round(flops / 10.**9, precision)) + ' ' + units elif units == 'MFLOPs': return str(round(flops / 10.**6, precision)) + ' ' + units elif units == 'KFLOPs': return str(round(flops / 10.**3, precision)) + ' ' + units else: return str(flops) + ' FLOPs' def params_to_string(num_params: float, units: Optional[str] = None, precision: int = 2) -> str: """Convert parameter number into a string. Args: num_params (float): Parameter number to be converted. units (str | None): Converted FLOPs units. Options are None, 'M', 'K' and ''. If set to None, it will automatically choose the most suitable unit for Parameter number. Default: None. precision (int): Digit number after the decimal point. Default: 2. Returns: str: The converted parameter number with units. 
Examples: >>> params_to_string(1e9) '1000.0 M' >>> params_to_string(2e5) '200.0 k' >>> params_to_string(3e-9) '3e-09' """ if units is None: if num_params // 10**6 > 0: return str(round(num_params / 10**6, precision)) + ' M' elif num_params // 10**3: return str(round(num_params / 10**3, precision)) + ' k' else: return str(num_params) else: if units == 'M': return str(round(num_params / 10.**6, precision)) + ' ' + units elif units == 'K': return str(round(num_params / 10.**3, precision)) + ' ' + units else: return str(num_params) def print_model_with_flops(model: nn.Module, total_flops: float, total_params: float, units: Optional[str] = 'GFLOPs', precision: int = 3, ost: TextIO = sys.stdout, flush: bool = False) -> None: """Print a model with FLOPs for each layer. Args: model (nn.Module): The model to be printed. total_flops (float): Total FLOPs of the model. total_params (float): Total parameter counts of the model. units (str | None): Converted FLOPs units. Default: 'GFLOPs'. precision (int): Digit number after the decimal point. Default: 3. ost (stream): same as `file` param in :func:`print`. Default: sys.stdout. flush (bool): same as that in :func:`print`. Default: False. 
Example: >>> class ExampleModel(nn.Module): >>> def __init__(self): >>> super().__init__() >>> self.conv1 = nn.Conv2d(3, 8, 3) >>> self.conv2 = nn.Conv2d(8, 256, 3) >>> self.conv3 = nn.Conv2d(256, 8, 3) >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) >>> self.flatten = nn.Flatten() >>> self.fc = nn.Linear(8, 1) >>> def forward(self, x): >>> x = self.conv1(x) >>> x = self.conv2(x) >>> x = self.conv3(x) >>> x = self.avg_pool(x) >>> x = self.flatten(x) >>> x = self.fc(x) >>> return x >>> model = ExampleModel() >>> x = (3, 16, 16) to print the complexity information state for each layer, you can use >>> get_model_complexity_info(model, x) or directly use >>> print_model_with_flops(model, 4579784.0, 37361) ExampleModel( 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) ) """ def accumulate_params(self): if is_supported_instance(self): return self.__params__ else: sum = 0 for m in self.children(): sum += m.accumulate_params() return sum def accumulate_flops(self): if is_supported_instance(self): return self.__flops__ / model.__batch_counter__ else: sum = 0 for m in self.children(): sum += m.accumulate_flops() return sum def flops_repr(self): accumulated_num_params = self.accumulate_params() accumulated_flops_cost = self.accumulate_flops() return ', '.join([ params_to_string( accumulated_num_params, units='M', precision=precision), f'{accumulated_num_params / 
total_params:.3%} Params', flops_to_string( accumulated_flops_cost, units=units, precision=precision), f'{accumulated_flops_cost / total_flops:.3%} FLOPs', self.original_extra_repr() ]) def add_extra_repr(m): m.accumulate_flops = accumulate_flops.__get__(m) m.accumulate_params = accumulate_params.__get__(m) flops_extra_repr = flops_repr.__get__(m) if m.extra_repr != flops_extra_repr: m.original_extra_repr = m.extra_repr m.extra_repr = flops_extra_repr assert m.extra_repr != m.original_extra_repr def del_extra_repr(m): if hasattr(m, 'original_extra_repr'): m.extra_repr = m.original_extra_repr del m.original_extra_repr if hasattr(m, 'accumulate_flops'): del m.accumulate_flops model.apply(add_extra_repr) print(model, file=ost, flush=flush) model.apply(del_extra_repr) def get_model_parameters_number(model: nn.Module) -> float: """Calculate parameter number of a model. Args: model (nn.module): The model for parameter number calculation. Returns: float: Parameter number of the model. """ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) return num_params def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: # adding additional methods to the existing module object, # this is done this way so that each function has access to self object net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501 net_main_module) net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501 net_main_module) net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501 net_main_module) net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501 net_main_module) net_main_module.reset_flops_count() return net_main_module def compute_average_flops_cost(self) -> Tuple[float, float]: """Compute average FLOPs cost. 
A method to compute average FLOPs cost, which will be available after `add_flops_counting_methods()` is called on a desired net object. Returns: float: Current mean flops consumption per image. """ batches_count = self.__batch_counter__ flops_sum = 0 for module in self.modules(): if is_supported_instance(module): flops_sum += module.__flops__ params_sum = get_model_parameters_number(self) return flops_sum / batches_count, params_sum def start_flops_count(self) -> None: """Activate the computation of mean flops consumption per image. A method to activate the computation of mean flops consumption per image. which will be available after ``add_flops_counting_methods()`` is called on a desired net object. It should be called before running the network. """ add_batch_counter_hook_function(self) def add_flops_counter_hook_function(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops_handle__'): return else: handle = module.register_forward_hook( get_modules_mapping()[type(module)]) module.__flops_handle__ = handle self.apply(partial(add_flops_counter_hook_function)) def stop_flops_count(self) -> None: """Stop computing the mean flops consumption per image. A method to stop computing the mean flops consumption per image, which will be available after ``add_flops_counting_methods()`` is called on a desired net object. It can be called to pause the computation whenever. """ remove_batch_counter_hook_function(self) self.apply(remove_flops_counter_hook_function) def reset_flops_count(self) -> None: """Reset statistics computed so far. A method to Reset computed statistics, which will be available after `add_flops_counting_methods()` is called on a desired net object. 
""" add_batch_counter_variables_or_reset(self) self.apply(add_flops_counter_variable_or_reset) # ---- Internal functions def empty_flops_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: module.__flops__ += 0 def upsample_flops_counter_hook(module: nn.Module, input: tuple, output: torch.Tensor) -> None: output_size = output[0] batch_size = output_size.shape[0] output_elements_count = batch_size for val in output_size.shape[1:]: output_elements_count *= val module.__flops__ += int(output_elements_count) def relu_flops_counter_hook(module: nn.Module, input: tuple, output: torch.Tensor) -> None: active_elements_count = output.numel() module.__flops__ += int(active_elements_count) def linear_flops_counter_hook(module: nn.Module, input: tuple, output: torch.Tensor) -> None: output_last_dim = output.shape[ -1] # pytorch checks dimensions, so here we don't care much module.__flops__ += int(np.prod(input[0].shape) * output_last_dim) def pool_flops_counter_hook(module: nn.Module, input: tuple, output: torch.Tensor) -> None: module.__flops__ += int(np.prod(input[0].shape)) def norm_flops_counter_hook(module: nn.Module, input: tuple, output: torch.Tensor) -> None: batch_flops = np.prod(input[0].shape) if (getattr(module, 'affine', False) or getattr(module, 'elementwise_affine', False)): batch_flops *= 2 module.__flops__ += int(batch_flops) def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, output: torch.Tensor) -> None: # Can have multiple inputs, getting the first one batch_size = input[0].shape[0] input_height, input_width = input[0].shape[2:] kernel_height, kernel_width = conv_module.kernel_size in_channels = conv_module.in_channels out_channels = conv_module.out_channels groups = conv_module.groups filters_per_channel = out_channels // groups conv_per_position_flops = ( kernel_height * kernel_width * in_channels * filters_per_channel) active_elements_count = batch_size * input_height * input_width overall_conv_flops = 
conv_per_position_flops * active_elements_count bias_flops = 0 if conv_module.bias is not None: output_height, output_width = output.shape[2:] bias_flops = out_channels * batch_size * output_height * output_width overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, output: torch.Tensor) -> None: # Can have multiple inputs, getting the first one batch_size = input[0].shape[0] output_dims = list(output.shape[2:]) kernel_dims = list(conv_module.kernel_size) in_channels = conv_module.in_channels out_channels = conv_module.out_channels groups = conv_module.groups filters_per_channel = out_channels // groups conv_per_position_flops = int( np.prod(kernel_dims)) * in_channels * filters_per_channel active_elements_count = batch_size * int(np.prod(output_dims)) overall_conv_flops = conv_per_position_flops * active_elements_count bias_flops = 0 if conv_module.bias is not None: bias_flops = out_channels * active_elements_count overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: batch_size = 1 if len(input) > 0: # Can have multiple inputs, getting the first one batch_size = len(input[0]) else: warnings.warn('No positional inputs found for a module, ' 'assuming batch size is 1.') module.__batch_counter__ += batch_size def add_batch_counter_variables_or_reset(module: nn.Module) -> None: module.__batch_counter__ = 0 def add_batch_counter_hook_function(module: nn.Module) -> None: if hasattr(module, '__batch_counter_handle__'): return handle = module.register_forward_hook(batch_counter_hook) module.__batch_counter_handle__ = handle def remove_batch_counter_hook_function(module: nn.Module) -> None: if hasattr(module, '__batch_counter_handle__'): module.__batch_counter_handle__.remove() del module.__batch_counter_handle__ def 
add_flops_counter_variable_or_reset(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops__') or hasattr(module, '__params__'): warnings.warn('variables __flops__ or __params__ are already ' 'defined for the module' + type(module).__name__ + ' ptflops can affect your code!') module.__flops__ = 0 module.__params__ = get_model_parameters_number(module) def is_supported_instance(module: nn.Module) -> bool: if type(module) in get_modules_mapping(): return True return False def remove_flops_counter_hook_function(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops_handle__'): module.__flops_handle__.remove() del module.__flops_handle__ def get_modules_mapping() -> Dict: return { # convolutions nn.Conv1d: conv_flops_counter_hook, nn.Conv2d: conv_flops_counter_hook, Conv2d: conv_flops_counter_hook, nn.Conv3d: conv_flops_counter_hook, Conv3d: conv_flops_counter_hook, # activations nn.ReLU: relu_flops_counter_hook, nn.PReLU: relu_flops_counter_hook, nn.ELU: relu_flops_counter_hook, nn.LeakyReLU: relu_flops_counter_hook, nn.ReLU6: relu_flops_counter_hook, # poolings nn.MaxPool1d: pool_flops_counter_hook, nn.AvgPool1d: pool_flops_counter_hook, nn.AvgPool2d: pool_flops_counter_hook, nn.MaxPool2d: pool_flops_counter_hook, MaxPool2d: pool_flops_counter_hook, nn.MaxPool3d: pool_flops_counter_hook, MaxPool3d: pool_flops_counter_hook, nn.AvgPool3d: pool_flops_counter_hook, nn.AdaptiveMaxPool1d: pool_flops_counter_hook, nn.AdaptiveAvgPool1d: pool_flops_counter_hook, nn.AdaptiveMaxPool2d: pool_flops_counter_hook, nn.AdaptiveAvgPool2d: pool_flops_counter_hook, nn.AdaptiveMaxPool3d: pool_flops_counter_hook, nn.AdaptiveAvgPool3d: pool_flops_counter_hook, # normalizations nn.BatchNorm1d: norm_flops_counter_hook, nn.BatchNorm2d: norm_flops_counter_hook, nn.BatchNorm3d: norm_flops_counter_hook, nn.GroupNorm: norm_flops_counter_hook, nn.InstanceNorm1d: norm_flops_counter_hook, nn.InstanceNorm2d: 
norm_flops_counter_hook, nn.InstanceNorm3d: norm_flops_counter_hook, nn.LayerNorm: norm_flops_counter_hook, # FC nn.Linear: linear_flops_counter_hook, Linear: linear_flops_counter_hook, # Upscale nn.Upsample: upsample_flops_counter_hook, # Deconvolution nn.ConvTranspose2d: deconv_flops_counter_hook, ConvTranspose2d: deconv_flops_counter_hook, } ================================================ FILE: mmcv/cnn/utils/fuse_conv_bn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: """Fuse conv and bn into one module. Args: conv (nn.Module): Conv to be fused. bn (nn.Module): BN to be fused. Returns: nn.Module: Fused module. """ conv_w = conv.weight conv_b = conv.bias if conv.bias is not None else torch.zeros_like( bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv def fuse_conv_bn(module: nn.Module) -> nn.Module: """Recursively fuse conv and bn in a module. During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures. Args: module (nn.Module): Module to be fused. Returns: nn.Module: Fused module. """ last_conv = None last_conv_name = None for name, child in module.named_children(): if isinstance(child, (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): if last_conv is None: # only fuse BN that is after Conv continue fused_conv = _fuse_conv_bn(last_conv, child) module._modules[last_conv_name] = fused_conv # To reduce changes, set BN as Identity instead of deleting it. 
module._modules[name] = nn.Identity() last_conv = None elif isinstance(child, nn.Conv2d): last_conv = child last_conv_name = name else: fuse_conv_bn(child) return module ================================================ FILE: mmcv/cnn/vgg.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging from typing import List, Optional, Sequence, Tuple, Union import torch.nn as nn from mmengine.model import constant_init, kaiming_init, normal_init from mmengine.runner import load_checkpoint from torch import Tensor def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: """3x3 convolution with padding.""" return nn.Conv2d( in_planes, out_planes, kernel_size=3, padding=dilation, dilation=dilation) def make_vgg_layer(inplanes: int, planes: int, num_blocks: int, dilation: int = 1, with_bn: bool = False, ceil_mode: bool = False) -> List[nn.Module]: layers = [] for _ in range(num_blocks): layers.append(conv3x3(inplanes, planes, dilation)) if with_bn: layers.append(nn.BatchNorm2d(planes)) layers.append(nn.ReLU(inplace=True)) inplanes = planes layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) return layers class VGG(nn.Module): """VGG backbone. Args: depth (int): Depth of vgg, from {11, 13, 16, 19}. with_bn (bool): Use BatchNorm or not. num_classes (int): number of classes for classification. num_stages (int): VGG stages, normally 5. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze running stats (mean and var). bn_frozen (bool): Whether to freeze weight and bias of BN layers. 
""" arch_settings = { 11: (1, 1, 2, 2, 2), 13: (2, 2, 2, 2, 2), 16: (2, 2, 3, 3, 3), 19: (2, 2, 4, 4, 4) } def __init__(self, depth: int, with_bn: bool = False, num_classes: int = -1, num_stages: int = 5, dilations: Sequence[int] = (1, 1, 1, 1, 1), out_indices: Sequence[int] = (0, 1, 2, 3, 4), frozen_stages: int = -1, bn_eval: bool = True, bn_frozen: bool = False, ceil_mode: bool = False, with_last_pool: bool = True): super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for vgg') assert num_stages >= 1 and num_stages <= 5 stage_blocks = self.arch_settings[depth] self.stage_blocks = stage_blocks[:num_stages] assert len(dilations) == num_stages assert max(out_indices) <= num_stages self.num_classes = num_classes self.out_indices = out_indices self.frozen_stages = frozen_stages self.bn_eval = bn_eval self.bn_frozen = bn_frozen self.inplanes = 3 start_idx = 0 vgg_layers = [] self.range_sub_modules = [] for i, num_blocks in enumerate(self.stage_blocks): num_modules = num_blocks * (2 + with_bn) + 1 end_idx = start_idx + num_modules dilation = dilations[i] planes = 64 * 2**i if i < 4 else 512 vgg_layer = make_vgg_layer( self.inplanes, planes, num_blocks, dilation=dilation, with_bn=with_bn, ceil_mode=ceil_mode) vgg_layers.extend(vgg_layer) self.inplanes = planes self.range_sub_modules.append([start_idx, end_idx]) start_idx = end_idx if not with_last_pool: vgg_layers.pop(-1) self.range_sub_modules[-1][1] -= 1 self.module_name = 'features' self.add_module(self.module_name, nn.Sequential(*vgg_layers)) if self.num_classes > 0: self.classifier = nn.Sequential( nn.Linear(512 * 7 * 7, 4096), nn.ReLU(True), nn.Dropout(), nn.Linear(4096, 4096), nn.ReLU(True), nn.Dropout(), nn.Linear(4096, num_classes), ) def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): 
if isinstance(m, nn.Conv2d): kaiming_init(m) elif isinstance(m, nn.BatchNorm2d): constant_init(m, 1) elif isinstance(m, nn.Linear): normal_init(m, std=0.01) else: raise TypeError('pretrained must be a str or None') def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]: outs = [] vgg_layers = getattr(self, self.module_name) for i in range(len(self.stage_blocks)): for j in range(*self.range_sub_modules[i]): vgg_layer = vgg_layers[j] x = vgg_layer(x) if i in self.out_indices: outs.append(x) if self.num_classes > 0: x = x.view(x.size(0), -1) x = self.classifier(x) outs.append(x) if len(outs) == 1: return outs[0] else: return tuple(outs) def train(self, mode: bool = True) -> None: super().train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): m.eval() if self.bn_frozen: for params in m.parameters(): params.requires_grad = False vgg_layers = getattr(self, self.module_name) if mode and self.frozen_stages >= 0: for i in range(self.frozen_stages): for j in range(*self.range_sub_modules[i]): mod = vgg_layers[j] mod.eval() for param in mod.parameters(): param.requires_grad = False ================================================ FILE: mmcv/image/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) from .geometric import (cutout, imcrop, imflip, imflip_, impad, impad_to_multiple, imrescale, imresize, imresize_like, imresize_to_multiple, imrotate, imshear, imtranslate, rescale_size) from .io import imfrombytes, imread, imwrite, supported_backends, use_backend from .misc import tensor2imgs from .photometric import (adjust_brightness, adjust_color, adjust_contrast, adjust_hue, adjust_lighting, adjust_sharpness, auto_contrast, clahe, imdenormalize, imequalize, iminvert, imnormalize, imnormalize_, lut_transform, posterize, solarize) __all__ = [ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting', 'adjust_hue' ] ================================================ FILE: mmcv/image/colorspace.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Callable, Union import cv2 import numpy as np def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: """Convert an image from the src colorspace to dst colorspace. Args: img (ndarray): The input image. src (str): The source colorspace, e.g., 'rgb', 'hsv'. dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. Returns: ndarray: The converted image. 
""" code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') out_img = cv2.cvtColor(img, code) return out_img def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: """Convert a BGR image to grayscale image. Args: img (ndarray): The input image. keepdim (bool): If False (by default), then return the grayscale image with 2 dims, otherwise 3 dims. Returns: ndarray: The converted grayscale image. """ out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if keepdim: out_img = out_img[..., None] return out_img def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: """Convert a RGB image to grayscale image. Args: img (ndarray): The input image. keepdim (bool): If False (by default), then return the grayscale image with 2 dims, otherwise 3 dims. Returns: ndarray: The converted grayscale image. """ out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) if keepdim: out_img = out_img[..., None] return out_img def gray2bgr(img: np.ndarray) -> np.ndarray: """Convert a grayscale image to BGR image. Args: img (ndarray): The input image. Returns: ndarray: The converted BGR image. """ img = img[..., None] if img.ndim == 2 else img out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) return out_img def gray2rgb(img: np.ndarray) -> np.ndarray: """Convert a grayscale image to RGB image. Args: img (ndarray): The input image. Returns: ndarray: The converted RGB image. """ img = img[..., None] if img.ndim == 2 else img out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) return out_img def _convert_input_type_range(img: np.ndarray) -> np.ndarray: """Convert the type and range of the input image. It converts the input image to np.float32 type and range of [0, 1]. It is mainly used for pre-processing the input image in colorspace conversion functions such as rgb2ycbcr and ycbcr2rgb. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. 
Returns: (ndarray): The converted image with type of np.float32 and range of [0, 1]. """ img_type = img.dtype img = img.astype(np.float32) if img_type == np.float32: pass elif img_type == np.uint8: img /= 255. else: raise TypeError('The img type should be np.float32 or np.uint8, ' f'but got {img_type}') return img def _convert_output_type_range( img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray: """Convert the type and range of the image according to dst_type. It converts the image to desired type and range. If `dst_type` is np.uint8, images will be converted to np.uint8 type with range [0, 255]. If `dst_type` is np.float32, it converts the image to np.float32 type with range [0, 1]. It is mainly used for post-processing images in colorspace conversion functions such as rgb2ycbcr and ycbcr2rgb. Args: img (ndarray): The image to be converted with np.float32 type and range [0, 255]. dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it converts the image to np.uint8 type with range [0, 255]. If dst_type is np.float32, it converts the image to np.float32 type with range [0, 1]. Returns: (ndarray): The converted image with desired type and range. """ if dst_type not in (np.uint8, np.float32): raise TypeError('The dst_type should be np.float32 or np.uint8, ' f'but got {dst_type}') if dst_type == np.uint8: img = img.round() else: img /= 255. return img.astype(dst_type) def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: """Convert a RGB image to YCbCr image. This function produces the same results as Matlab's `rgb2ycbcr` function. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. 
np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. y_only (bool): Whether to only return Y channel. Default: False. Returns: ndarray: The converted YCbCr image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) if y_only: out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 else: out_img = np.matmul( img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], [24.966, 112.0, -18.214]]) + [16, 128, 128] out_img = _convert_output_type_range(out_img, img_type) return out_img def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: """Convert a BGR image to YCbCr image. The bgr version of rgb2ycbcr. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. y_only (bool): Whether to only return Y channel. Default: False. Returns: ndarray: The converted YCbCr image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) if y_only: out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 else: out_img = np.matmul( img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], [65.481, -37.797, 112.0]]) + [16, 128, 128] out_img = _convert_output_type_range(out_img, img_type) return out_img def ycbcr2rgb(img: np.ndarray) -> np.ndarray: """Convert a YCbCr image to RGB image. This function produces the same results as Matlab's ycbcr2rgb function. It implements the ITU-R BT.601 conversion for standard-definition television. 
See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. Returns: ndarray: The converted RGB image. The output image has the same type and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0, -0.00153632, 0.00791071], [0.00625893, -0.00318811, 0]]) * 255.0 + [ -222.921, 135.576, -276.836 ] out_img = _convert_output_type_range(out_img, img_type) return out_img def ycbcr2bgr(img: np.ndarray) -> np.ndarray: """Convert a YCbCr image to BGR image. The bgr version of ycbcr2rgb. It implements the ITU-R BT.601 conversion for standard-definition television. See more details in https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. In OpenCV, it implements a JPEG conversion. See more details in https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. Args: img (ndarray): The input image. It accepts: 1. np.uint8 type with range [0, 255]; 2. np.float32 type with range [0, 1]. Returns: ndarray: The converted BGR image. The output image has the same type and range as input image. 
""" img_type = img.dtype img = _convert_input_type_range(img) * 255 out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], [0.00791071, -0.00153632, 0], [0, -0.00318811, 0.00625893]]) * 255.0 + [ -276.836, 135.576, -222.921 ] out_img = _convert_output_type_range(out_img, img_type) return out_img def convert_color_factory(src: str, dst: str) -> Callable: code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') def convert_color(img: np.ndarray) -> np.ndarray: out_img = cv2.cvtColor(img, code) return out_img convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} image. Args: img (ndarray or str): The input image. Returns: ndarray: The converted {dst.upper()} image. """ return convert_color bgr2rgb = convert_color_factory('bgr', 'rgb') rgb2bgr = convert_color_factory('rgb', 'bgr') bgr2hsv = convert_color_factory('bgr', 'hsv') hsv2bgr = convert_color_factory('hsv', 'bgr') bgr2hls = convert_color_factory('bgr', 'hls') hls2bgr = convert_color_factory('hls', 'bgr') ================================================ FILE: mmcv/image/geometric.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numbers from typing import List, Optional, Tuple, Union, no_type_check import cv2 import numpy as np from mmengine.utils import to_2tuple from .io import imread_backend try: from PIL import Image except ImportError: Image = None def _scale_size( size: Tuple[int, int], scale: Union[float, int, Tuple[float, float], Tuple[int, int]], ) -> Tuple[int, int]: """Rescale a size by a ratio. Args: size (tuple[int]): (w, h). scale (float | int | tuple(float) | tuple(int)): Scaling factor. Returns: tuple[int]: scaled size. 
""" if isinstance(scale, (float, int)): scale = (scale, scale) w, h = size return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) cv2_interp_codes = { 'nearest': cv2.INTER_NEAREST, 'bilinear': cv2.INTER_LINEAR, 'bicubic': cv2.INTER_CUBIC, 'area': cv2.INTER_AREA, 'lanczos': cv2.INTER_LANCZOS4 } cv2_border_modes = { 'constant': cv2.BORDER_CONSTANT, 'replicate': cv2.BORDER_REPLICATE, 'reflect': cv2.BORDER_REFLECT, 'wrap': cv2.BORDER_WRAP, 'reflect_101': cv2.BORDER_REFLECT_101, 'transparent': cv2.BORDER_TRANSPARENT, 'isolated': cv2.BORDER_ISOLATED } # Pillow >=v9.1.0 use a slightly different naming scheme for filters. # Set pillow_interp_codes according to the naming scheme used. if Image is not None: if hasattr(Image, 'Resampling'): pillow_interp_codes = { 'nearest': Image.Resampling.NEAREST, 'bilinear': Image.Resampling.BILINEAR, 'bicubic': Image.Resampling.BICUBIC, 'box': Image.Resampling.BOX, 'lanczos': Image.Resampling.LANCZOS, 'hamming': Image.Resampling.HAMMING } else: pillow_interp_codes = { 'nearest': Image.NEAREST, 'bilinear': Image.BILINEAR, 'bicubic': Image.BICUBIC, 'box': Image.BOX, 'lanczos': Image.LANCZOS, 'hamming': Image.HAMMING } def imresize( img: np.ndarray, size: Tuple[int, int], return_scale: bool = False, interpolation: str = 'bilinear', out: Optional[np.ndarray] = None, backend: Optional[str] = None ) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: """Resize image to a given size. Args: img (ndarray): The input image. size (tuple[int]): Target size (w, h). return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. out (ndarray): The output destination. backend (str | None): The image resize backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. 
Default: None. Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = img.shape[:2] if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported for resize.' f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' pil_image = Image.fromarray(img) pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) resized_img = np.array(pil_image) else: resized_img = cv2.resize( img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) if not return_scale: return resized_img else: w_scale = size[0] / w h_scale = size[1] / h return resized_img, w_scale, h_scale @no_type_check def imresize_to_multiple( img: np.ndarray, divisor: Union[int, Tuple[int, int]], size: Union[int, Tuple[int, int], None] = None, scale_factor: Union[float, int, Tuple[float, float], Tuple[int, int], None] = None, keep_ratio: bool = False, return_scale: bool = False, interpolation: str = 'bilinear', out: Optional[np.ndarray] = None, backend: Optional[str] = None ) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: """Resize image according to a given size or scale factor and then rounds up the the resized or rescaled image size to the nearest value that can be divided by the divisor. Args: img (ndarray): The input image. divisor (int | tuple): Resized image size will be a multiple of divisor. If divisor is a tuple, divisor should be (w_divisor, h_divisor). size (None | int | tuple[int]): Target size (w, h). Default: None. scale_factor (None | float | int | tuple[float] | tuple[int]): Multiplier for spatial size. Should match input size if it is a tuple and the 2D style is (w_scale_factor, h_scale_factor). Default: None. keep_ratio (bool): Whether to keep the aspect ratio when resizing the image. Default: False. return_scale (bool): Whether to return `w_scale` and `h_scale`. 
interpolation (str): Interpolation method, accepted values are "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' backend, "nearest", "bilinear" for 'pillow' backend. out (ndarray): The output destination. backend (str | None): The image resize backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. """ h, w = img.shape[:2] if size is not None and scale_factor is not None: raise ValueError('only one of size or scale_factor should be defined') elif size is None and scale_factor is None: raise ValueError('one of size or scale_factor should be defined') elif size is not None: size = to_2tuple(size) if keep_ratio: size = rescale_size((w, h), size, return_scale=False) else: size = _scale_size((w, h), scale_factor) divisor = to_2tuple(divisor) size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor)) resized_img, w_scale, h_scale = imresize( img, size, return_scale=True, interpolation=interpolation, out=out, backend=backend) if return_scale: return resized_img, w_scale, h_scale else: return resized_img def imresize_like( img: np.ndarray, dst_img: np.ndarray, return_scale: bool = False, interpolation: str = 'bilinear', backend: Optional[str] = None ) -> Union[Tuple[np.ndarray, float, float], np.ndarray]: """Resize image to the same size of a given image. Args: img (ndarray): The input image. dst_img (ndarray): The target image. return_scale (bool): Whether to return `w_scale` and `h_scale`. interpolation (str): Same as :func:`resize`. backend (str | None): Same as :func:`resize`. Returns: tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or `resized_img`. 
""" h, w = dst_img.shape[:2] return imresize(img, (w, h), return_scale, interpolation, backend=backend) def rescale_size(old_size: tuple, scale: Union[float, int, Tuple[int, int]], return_scale: bool = False) -> tuple: """Calculate the new size to be rescaled to. Args: old_size (tuple[int]): The old size (w, h) of image. scale (float | int | tuple[int]): The scaling factor or maximum size. If it is a float number or an integer, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image size. Returns: tuple[int]: The new rescaled image size. """ w, h = old_size if isinstance(scale, (float, int)): if scale <= 0: raise ValueError(f'Invalid scale {scale}, must be positive.') scale_factor = scale elif isinstance(scale, tuple): max_long_edge = max(scale) max_short_edge = min(scale) scale_factor = min(max_long_edge / max(h, w), max_short_edge / min(h, w)) else: raise TypeError( f'Scale must be a number or tuple of int, but got {type(scale)}') new_size = _scale_size((w, h), scale_factor) if return_scale: return new_size, scale_factor else: return new_size def imrescale( img: np.ndarray, scale: Union[float, int, Tuple[int, int]], return_scale: bool = False, interpolation: str = 'bilinear', backend: Optional[str] = None ) -> Union[np.ndarray, Tuple[np.ndarray, float]]: """Resize image while keeping the aspect ratio. Args: img (ndarray): The input image. scale (float | int | tuple[int]): The scaling factor or maximum size. If it is a float number or an integer, then the image will be rescaled by this factor, else if it is a tuple of 2 integers, then the image will be rescaled as large as possible within the scale. return_scale (bool): Whether to return the scaling factor besides the rescaled image. interpolation (str): Same as :func:`resize`. backend (str | None): Same as :func:`resize`. 
Returns: ndarray: The rescaled image. """ h, w = img.shape[:2] new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) rescaled_img = imresize( img, new_size, interpolation=interpolation, backend=backend) if return_scale: return rescaled_img, scale_factor else: return rescaled_img def imflip(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: """Flip an image horizontally or vertically. Args: img (ndarray): Image to be flipped. direction (str): The flip direction, either "horizontal" or "vertical" or "diagonal". Returns: ndarray: The flipped image. """ assert direction in ['horizontal', 'vertical', 'diagonal'] if direction == 'horizontal': return np.flip(img, axis=1) elif direction == 'vertical': return np.flip(img, axis=0) else: return np.flip(img, axis=(0, 1)) def imflip_(img: np.ndarray, direction: str = 'horizontal') -> np.ndarray: """Inplace flip an image horizontally or vertically. Args: img (ndarray): Image to be flipped. direction (str): The flip direction, either "horizontal" or "vertical" or "diagonal". Returns: ndarray: The flipped image (inplace). """ assert direction in ['horizontal', 'vertical', 'diagonal'] if direction == 'horizontal': return cv2.flip(img, 1, img) elif direction == 'vertical': return cv2.flip(img, 0, img) else: return cv2.flip(img, -1, img) def imrotate(img: np.ndarray, angle: float, center: Optional[Tuple[float, float]] = None, scale: float = 1.0, border_value: int = 0, interpolation: str = 'bilinear', auto_bound: bool = False, border_mode: str = 'constant') -> np.ndarray: """Rotate an image. Args: img (np.ndarray): Image to be rotated. angle (float): Rotation angle in degrees, positive values mean clockwise rotation. center (tuple[float], optional): Center point (w, h) of the rotation in the source image. If not specified, the center of the image will be used. scale (float): Isotropic scale factor. border_value (int): Border value used in case of a constant border. Defaults to 0. 
interpolation (str): Same as :func:`resize`. auto_bound (bool): Whether to adjust the image size to cover the whole rotated image. border_mode (str): Pixel extrapolation method. Defaults to 'constant'. Returns: np.ndarray: The rotated image. """ if center is not None and auto_bound: raise ValueError('`auto_bound` conflicts with `center`') h, w = img.shape[:2] if center is None: center = ((w - 1) * 0.5, (h - 1) * 0.5) assert isinstance(center, tuple) matrix = cv2.getRotationMatrix2D(center, -angle, scale) if auto_bound: cos = np.abs(matrix[0, 0]) sin = np.abs(matrix[0, 1]) new_w = h * sin + w * cos new_h = h * cos + w * sin matrix[0, 2] += (new_w - w) * 0.5 matrix[1, 2] += (new_h - h) * 0.5 w = int(np.round(new_w)) h = int(np.round(new_h)) rotated = cv2.warpAffine( img, matrix, (w, h), flags=cv2_interp_codes[interpolation], borderMode=cv2_border_modes[border_mode], borderValue=border_value) return rotated def bbox_clip(bboxes: np.ndarray, img_shape: Tuple[int, int]) -> np.ndarray: """Clip bboxes to fit the image shape. Args: bboxes (ndarray): Shape (..., 4*k) img_shape (tuple[int]): (height, width) of the image. Returns: ndarray: Clipped bboxes. """ assert bboxes.shape[-1] % 4 == 0 cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) cmin[0::2] = img_shape[1] - 1 cmin[1::2] = img_shape[0] - 1 clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) return clipped_bboxes def bbox_scaling(bboxes: np.ndarray, scale: float, clip_shape: Optional[Tuple[int, int]] = None) -> np.ndarray: """Scaling bboxes w.r.t the box center. Args: bboxes (ndarray): Shape(..., 4). scale (float): Scaling factor. clip_shape (tuple[int], optional): If specified, bboxes that exceed the boundary will be clipped according to the given shape (h, w). Returns: ndarray: Scaled bboxes. 
""" if float(scale) == 1.0: scaled_bboxes = bboxes.copy() else: w = bboxes[..., 2] - bboxes[..., 0] + 1 h = bboxes[..., 3] - bboxes[..., 1] + 1 dw = (w * (scale - 1)) * 0.5 dh = (h * (scale - 1)) * 0.5 scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) if clip_shape is not None: return bbox_clip(scaled_bboxes, clip_shape) else: return scaled_bboxes def imcrop( img: np.ndarray, bboxes: np.ndarray, scale: float = 1.0, pad_fill: Union[float, list, None] = None ) -> Union[np.ndarray, List[np.ndarray]]: """Crop image patches. 3 steps: scale the bboxes -> clip bboxes -> crop and pad. Args: img (ndarray): Image to be cropped. bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. scale (float, optional): Scale ratio of bboxes, the default value 1.0 means no scaling. pad_fill (Number | list[Number]): Value to be filled for padding. Default: None, which means no padding. Returns: list[ndarray] | ndarray: The cropped image patches. """ chn = 1 if img.ndim == 2 else img.shape[2] if pad_fill is not None: if isinstance(pad_fill, (int, float)): pad_fill = [pad_fill for _ in range(chn)] assert len(pad_fill) == chn _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) clipped_bbox = bbox_clip(scaled_bboxes, img.shape) patches = [] for i in range(clipped_bbox.shape[0]): x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) if pad_fill is None: patch = img[y1:y2 + 1, x1:x2 + 1, ...] else: _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) patch_h = _y2 - _y1 + 1 patch_w = _x2 - _x1 + 1 if chn == 1: patch_shape = (patch_h, patch_w) else: patch_shape = (patch_h, patch_w, chn) # type: ignore patch = np.array( pad_fill, dtype=img.dtype) * np.ones( patch_shape, dtype=img.dtype) x_start = 0 if _x1 >= 0 else -_x1 y_start = 0 if _y1 >= 0 else -_y1 w = x2 - x1 + 1 h = y2 - y1 + 1 patch[y_start:y_start + h, x_start:x_start + w, ...] = img[y1:y1 + h, x1:x1 + w, ...] 
patches.append(patch) if bboxes.ndim == 1: return patches[0] else: return patches def impad(img: np.ndarray, *, shape: Optional[Tuple[int, int]] = None, padding: Union[int, tuple, None] = None, pad_val: Union[float, List] = 0, padding_mode: str = 'constant') -> np.ndarray: """Pad the given image to a certain shape or pad on all sides with specified padding mode and padding value. Args: img (ndarray): Image to be padded. shape (tuple[int]): Expected padding shape (h, w). Default: None. padding (int or tuple[int]): Padding on each border. If a single int is provided this is used to pad all borders. If tuple of length 2 is provided this is the padding on left/right and top/bottom respectively. If a tuple of length 4 is provided this is the padding for the left, top, right and bottom borders respectively. Default: None. Note that `shape` and `padding` can not be both set. pad_val (Number | Sequence[Number]): Values to be filled in padding areas when padding_mode is 'constant'. Default: 0. padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default: constant. - constant: pads with a constant value, this value is specified with pad_val. - edge: pads with the last value at the edge of the image. - reflect: pads with reflection of image without repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode will result in [3, 2, 1, 2, 3, 4, 3, 2]. - symmetric: pads with reflection of image repeating the last value on the edge. For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode will result in [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. 
""" assert (shape is not None) ^ (padding is not None) if shape is not None: width = max(shape[1] - img.shape[1], 0) height = max(shape[0] - img.shape[0], 0) padding = (0, 0, width, height) # check pad_val if isinstance(pad_val, tuple): assert len(pad_val) == img.shape[-1] elif not isinstance(pad_val, numbers.Number): raise TypeError('pad_val must be a int or a tuple. ' f'But received {type(pad_val)}') # check padding if isinstance(padding, tuple) and len(padding) in [2, 4]: if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) elif isinstance(padding, numbers.Number): padding = (padding, padding, padding, padding) else: raise ValueError('Padding must be a int or a 2, or 4 element tuple.' f'But received {padding}') # check padding mode assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] border_type = { 'constant': cv2.BORDER_CONSTANT, 'edge': cv2.BORDER_REPLICATE, 'reflect': cv2.BORDER_REFLECT_101, 'symmetric': cv2.BORDER_REFLECT } img = cv2.copyMakeBorder( img, padding[1], padding[3], padding[0], padding[2], border_type[padding_mode], value=pad_val) return img def impad_to_multiple(img: np.ndarray, divisor: int, pad_val: Union[float, List] = 0) -> np.ndarray: """Pad an image to ensure each edge to be multiple to some number. Args: img (ndarray): Image to be padded. divisor (int): Padded image edges will be multiple to divisor. pad_val (Number | Sequence[Number]): Same as :func:`impad`. Returns: ndarray: The padded image. """ pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) def cutout(img: np.ndarray, shape: Union[int, Tuple[int, int]], pad_val: Union[int, float, tuple] = 0) -> np.ndarray: """Randomly cut out a rectangle from the original img. Args: img (ndarray): Image to be cutout. shape (int | tuple[int]): Expected cutout shape (h, w). If given as a int, the value will be used for both h and w. 
pad_val (int | float | tuple[int | float]): Values to be filled in the cut area. Defaults to 0. Returns: ndarray: The cutout image. """ channels = 1 if img.ndim == 2 else img.shape[2] if isinstance(shape, int): cut_h, cut_w = shape, shape else: assert isinstance(shape, tuple) and len(shape) == 2, \ f'shape must be a int or a tuple with length 2, but got type ' \ f'{type(shape)} instead.' cut_h, cut_w = shape if isinstance(pad_val, (int, float)): pad_val = tuple([pad_val] * channels) elif isinstance(pad_val, tuple): assert len(pad_val) == channels, \ 'Expected the num of elements in tuple equals the channels' \ 'of input image. Found {} vs {}'.format( len(pad_val), channels) else: raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') img_h, img_w = img.shape[:2] y0 = np.random.uniform(img_h) x0 = np.random.uniform(img_w) y1 = int(max(0, y0 - cut_h / 2.)) x1 = int(max(0, x0 - cut_w / 2.)) y2 = min(img_h, y1 + cut_h) x2 = min(img_w, x1 + cut_w) if img.ndim == 2: patch_shape = (y2 - y1, x2 - x1) else: patch_shape = (y2 - y1, x2 - x1, channels) # type: ignore img_cutout = img.copy() patch = np.array( pad_val, dtype=img.dtype) * np.ones( patch_shape, dtype=img.dtype) img_cutout[y1:y2, x1:x2, ...] = patch return img_cutout def _get_shear_matrix(magnitude: Union[int, float], direction: str = 'horizontal') -> np.ndarray: """Generate the shear matrix for transformation. Args: magnitude (int | float): The magnitude used for shear. direction (str): The flip direction, either "horizontal" or "vertical". Returns: ndarray: The shear matrix with dtype float32. """ if direction == 'horizontal': shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) elif direction == 'vertical': shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) return shear_matrix def imshear(img: np.ndarray, magnitude: Union[int, float], direction: str = 'horizontal', border_value: Union[int, Tuple[int, int]] = 0, interpolation: str = 'bilinear') -> np.ndarray: """Shear an image. 
Args: img (ndarray): Image to be sheared with format (h, w) or (h, w, c). magnitude (int | float): The magnitude used for shear. direction (str): The flip direction, either "horizontal" or "vertical". border_value (int | tuple[int]): Value used in case of a constant border. interpolation (str): Same as :func:`resize`. Returns: ndarray: The sheared image. """ assert direction in ['horizontal', 'vertical'], f'Invalid direction: {direction}' height, width = img.shape[:2] if img.ndim == 2: channels = 1 elif img.ndim == 3: channels = img.shape[-1] if isinstance(border_value, int): border_value = tuple([border_value] * channels) # type: ignore elif isinstance(border_value, tuple): assert len(border_value) == channels, \ 'Expected the num of elements in tuple equals the channels' \ 'of input image. Found {} vs {}'.format( len(border_value), channels) else: raise ValueError( f'Invalid type {type(border_value)} for `border_value`') shear_matrix = _get_shear_matrix(magnitude, direction) sheared = cv2.warpAffine( img, shear_matrix, (width, height), # Note case when the number elements in `border_value` # greater than 3 (e.g. shearing masks whose channels large # than 3) will raise TypeError in `cv2.warpAffine`. # Here simply slice the first 3 values in `border_value`. borderValue=border_value[:3], # type: ignore flags=cv2_interp_codes[interpolation]) return sheared def _get_translate_matrix(offset: Union[int, float], direction: str = 'horizontal') -> np.ndarray: """Generate the translate matrix. Args: offset (int | float): The offset used for translate. direction (str): The translate direction, either "horizontal" or "vertical". Returns: ndarray: The translate matrix with dtype float32. 
""" if direction == 'horizontal': translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) elif direction == 'vertical': translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) return translate_matrix def imtranslate(img: np.ndarray, offset: Union[int, float], direction: str = 'horizontal', border_value: Union[int, tuple] = 0, interpolation: str = 'bilinear') -> np.ndarray: """Translate an image. Args: img (ndarray): Image to be translated with format (h, w) or (h, w, c). offset (int | float): The offset used for translate. direction (str): The translate direction, either "horizontal" or "vertical". border_value (int | tuple[int]): Value used in case of a constant border. interpolation (str): Same as :func:`resize`. Returns: ndarray: The translated image. """ assert direction in ['horizontal', 'vertical'], f'Invalid direction: {direction}' height, width = img.shape[:2] if img.ndim == 2: channels = 1 elif img.ndim == 3: channels = img.shape[-1] if isinstance(border_value, int): border_value = tuple([border_value] * channels) elif isinstance(border_value, tuple): assert len(border_value) == channels, \ 'Expected the num of elements in tuple equals the channels' \ 'of input image. Found {} vs {}'.format( len(border_value), channels) else: raise ValueError( f'Invalid type {type(border_value)} for `border_value`.') translate_matrix = _get_translate_matrix(offset, direction) translated = cv2.warpAffine( img, translate_matrix, (width, height), # Note case when the number elements in `border_value` # greater than 3 (e.g. translating masks whose channels # large than 3) will raise TypeError in `cv2.warpAffine`. # Here simply slice the first 3 values in `border_value`. borderValue=border_value[:3], flags=cv2_interp_codes[interpolation]) return translated ================================================ FILE: mmcv/image/io.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import io import os.path as osp import warnings from pathlib import Path from typing import Optional, Union import cv2 import mmengine.fileio as fileio import numpy as np from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, IMREAD_UNCHANGED) from mmengine.utils import is_filepath, is_str try: from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG except ImportError: TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None try: from PIL import Image, ImageOps except ImportError: Image = None try: import tifffile except ImportError: tifffile = None jpeg = None supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] imread_flags = { 'color': IMREAD_COLOR, 'grayscale': IMREAD_GRAYSCALE, 'unchanged': IMREAD_UNCHANGED, 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, 'grayscale_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE } imread_backend = 'cv2' def use_backend(backend: str) -> None: """Select a backend for image decoding. Args: backend (str): The image decoding backend type. Options are `cv2`, `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` file format. 
""" assert backend in supported_backends global imread_backend imread_backend = backend if imread_backend == 'turbojpeg': if TurboJPEG is None: raise ImportError('`PyTurboJPEG` is not installed') global jpeg if jpeg is None: jpeg = TurboJPEG() elif imread_backend == 'pillow': if Image is None: raise ImportError('`Pillow` is not installed') elif imread_backend == 'tifffile': if tifffile is None: raise ImportError('`tifffile` is not installed') def _jpegflag(flag: str = 'color', channel_order: str = 'bgr'): channel_order = channel_order.lower() if channel_order not in ['rgb', 'bgr']: raise ValueError('channel order must be either "rgb" or "bgr"') if flag == 'color': if channel_order == 'bgr': return TJPF_BGR elif channel_order == 'rgb': return TJCS_RGB elif flag == 'grayscale': return TJPF_GRAY else: raise ValueError('flag must be "color" or "grayscale"') def _pillow2array(img, flag: str = 'color', channel_order: str = 'bgr') -> np.ndarray: """Convert a pillow image to numpy array. Args: img (:obj:`PIL.Image.Image`): The image loaded using PIL flag (str): Flags specifying the color type of a loaded image, candidates are 'color', 'grayscale' and 'unchanged'. Default to 'color'. channel_order (str): The channel order of the output image array, candidates are 'bgr' and 'rgb'. Default to 'bgr'. Returns: np.ndarray: The converted numpy array """ channel_order = channel_order.lower() if channel_order not in ['rgb', 'bgr']: raise ValueError('channel order must be either "rgb" or "bgr"') if flag == 'unchanged': array = np.array(img) if array.ndim >= 3 and array.shape[2] >= 3: # color image array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR else: # Handle exif orientation tag if flag in ['color', 'grayscale']: img = ImageOps.exif_transpose(img) # If the image mode is not 'RGB', convert it to 'RGB' first. 
if img.mode != 'RGB': if img.mode != 'LA': # Most formats except 'LA' can be directly converted to RGB img = img.convert('RGB') else: # When the mode is 'LA', the default conversion will fill in # the canvas with black, which sometimes shadows black objects # in the foreground. # # Therefore, a random color (124, 117, 104) is used for canvas img_rgba = img.convert('RGBA') img = Image.new('RGB', img_rgba.size, (124, 117, 104)) img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha if flag in ['color', 'color_ignore_orientation']: array = np.array(img) if channel_order != 'rgb': array = array[:, :, ::-1] # RGB to BGR elif flag in ['grayscale', 'grayscale_ignore_orientation']: img = img.convert('L') array = np.array(img) else: raise ValueError( 'flag must be "color", "grayscale", "unchanged", ' f'"color_ignore_orientation" or "grayscale_ignore_orientation"' f' but got {flag}') return array def imread(img_or_path: Union[np.ndarray, str, Path], flag: str = 'color', channel_order: str = 'bgr', backend: Optional[str] = None, file_client_args: Optional[dict] = None, *, backend_args: Optional[dict] = None) -> np.ndarray: """Read an image. Args: img_or_path (ndarray or str or Path): Either a numpy array or str or pathlib.Path. If it is a numpy array (loaded image), then it will be returned as is. flag (str): Flags specifying the color type of a loaded image, candidates are `color`, `grayscale`, `unchanged`, `color_ignore_orientation` and `grayscale_ignore_orientation`. By default, `cv2` and `pillow` backend would rotate the image according to its EXIF info unless called with `unchanged` or `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend always ignore image's EXIF info regardless of the flag. The `turbojpeg` backend only supports `color` and `grayscale`. channel_order (str): Order of channel, candidates are `bgr` and `rgb`. backend (str | None): The image decoding backend type. Options are `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. 
If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmengine.fileio.FileClient` for details. Default: None. It will be deprecated in future. Please use ``backend_args`` instead. Deprecated in version 2.0.0rc4. backend_args (dict, optional): Instantiates the corresponding file backend. It may contain `backend` key to specify the file backend. If it contains, the file backend corresponding to this value will be used and initialized with the remaining values, otherwise the corresponding file backend will be selected based on the prefix of the file path. Defaults to None. New in version 2.0.0rc4. Returns: ndarray: Loaded image array. Examples: >>> import mmcv >>> img_path = '/path/to/img.jpg' >>> img = mmcv.imread(img_path) >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb', ... backend='cv2') >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr', ... backend='pillow') >>> s3_img_path = 's3://bucket/img.jpg' >>> # infer the file backend by the prefix s3 >>> img = mmcv.imread(s3_img_path) >>> # manually set the file backend petrel >>> img = mmcv.imread(s3_img_path, backend_args={ ... 'backend': 'petrel'}) >>> http_img_path = 'http://path/to/img.jpg' >>> img = mmcv.imread(http_img_path) >>> img = mmcv.imread(http_img_path, backend_args={ ... 'backend': 'http'}) """ if file_client_args is not None: warnings.warn( '"file_client_args" will be deprecated in future. 
' 'Please use "backend_args" instead', DeprecationWarning) if backend_args is not None: raise ValueError( '"file_client_args" and "backend_args" cannot be set at the ' 'same time.') if isinstance(img_or_path, Path): img_or_path = str(img_or_path) if isinstance(img_or_path, np.ndarray): return img_or_path elif is_str(img_or_path): if file_client_args is not None: file_client = fileio.FileClient.infer_client( file_client_args, img_or_path) img_bytes = file_client.get(img_or_path) else: img_bytes = fileio.get(img_or_path, backend_args=backend_args) return imfrombytes(img_bytes, flag, channel_order, backend) else: raise TypeError('"img" must be a numpy array or a str or ' 'a pathlib.Path object') def imfrombytes(content: bytes, flag: str = 'color', channel_order: str = 'bgr', backend: Optional[str] = None) -> np.ndarray: """Read an image from bytes. Args: content (bytes): Image bytes got from files or other streams. flag (str): Same as :func:`imread`. channel_order (str): The channel order of the output, candidates are 'bgr' and 'rgb'. Default to 'bgr'. backend (str | None): The image decoding backend type. Options are `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. Returns: ndarray: Loaded image array. Examples: >>> img_path = '/path/to/img.jpg' >>> with open(img_path, 'rb') as f: >>> img_buff = f.read() >>> img = mmcv.imfrombytes(img_buff) >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb') >>> img = mmcv.imfrombytes(img_buff, backend='pillow') >>> img = mmcv.imfrombytes(img_buff, backend='cv2') """ if backend is None: backend = imread_backend if backend not in supported_backends: raise ValueError( f'backend: {backend} is not supported. 
Supported ' "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'") if backend == 'turbojpeg': img = jpeg.decode( # type: ignore content, _jpegflag(flag, channel_order)) if img.shape[-1] == 1: img = img[:, :, 0] return img elif backend == 'pillow': with io.BytesIO(content) as buff: img = Image.open(buff) img = _pillow2array(img, flag, channel_order) return img elif backend == 'tifffile': with io.BytesIO(content) as buff: img = tifffile.imread(buff) return img else: img_np = np.frombuffer(content, np.uint8) flag = imread_flags[flag] if is_str(flag) else flag img = cv2.imdecode(img_np, flag) if flag == IMREAD_COLOR and channel_order == 'rgb': cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) return img def imwrite(img: np.ndarray, file_path: str, params: Optional[list] = None, auto_mkdir: Optional[bool] = None, file_client_args: Optional[dict] = None, *, backend_args: Optional[dict] = None) -> bool: """Write image to file. Warning: The parameter `auto_mkdir` will be deprecated in the future and every file clients will make directory automatically. Args: img (ndarray): Image array to be written. file_path (str): Image file path. params (None or list): Same as opencv :func:`imwrite` interface. auto_mkdir (bool): If the parent folder of `file_path` does not exist, whether to create it automatically. It will be deprecated. file_client_args (dict, optional): Arguments to instantiate a FileClient. See :class:`mmengine.fileio.FileClient` for details. Default: None. It will be deprecated in future. Please use ``backend_args`` instead. Deprecated in version 2.0.0rc4. backend_args (dict, optional): Instantiates the corresponding file backend. It may contain `backend` key to specify the file backend. If it contains, the file backend corresponding to this value will be used and initialized with the remaining values, otherwise the corresponding file backend will be selected based on the prefix of the file path. Defaults to None. New in version 2.0.0rc4. Returns: bool: Successful or not. 
Examples: >>> # write to hard disk client >>> ret = mmcv.imwrite(img, '/path/to/img.jpg') >>> # infer the file backend by the prefix s3 >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg') >>> # manually set the file backend petrel >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', backend_args={ ... 'backend': 'petrel'}) """ if file_client_args is not None: warnings.warn( '"file_client_args" will be deprecated in future. ' 'Please use "backend_args" instead', DeprecationWarning) if backend_args is not None: raise ValueError( '"file_client_args" and "backend_args" cannot be set at the ' 'same time.') assert is_filepath(file_path) file_path = str(file_path) if auto_mkdir is not None: warnings.warn( 'The parameter `auto_mkdir` will be deprecated in the future and ' 'every file clients will make directory automatically.') img_ext = osp.splitext(file_path)[-1] # Encode image according to image suffix. # For example, if image path is '/path/your/img.jpg', the encode # format is '.jpg'. flag, img_buff = cv2.imencode(img_ext, img, params) if file_client_args is not None: file_client = fileio.FileClient.infer_client(file_client_args, file_path) file_client.put(img_buff.tobytes(), file_path) else: fileio.put(img_buff.tobytes(), file_path, backend_args=backend_args) return flag ================================================ FILE: mmcv/image/misc.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Optional import numpy as np import mmcv try: import torch except ImportError: torch = None def tensor2imgs(tensor, mean: Optional[tuple] = None, std: Optional[tuple] = None, to_rgb: bool = True) -> list: """Convert tensor to 3-channel images or 1-channel gray images. Args: tensor (torch.Tensor): Tensor that contains multiple images, shape ( N, C, H, W). :math:`C` can be either 3 or 1. mean (tuple[float], optional): Mean of images. 
If None, (0, 0, 0) will be used for tensor with 3-channel, while (0, ) for tensor with 1-channel. Defaults to None. std (tuple[float], optional): Standard deviation of images. If None, (1, 1, 1) will be used for tensor with 3-channel, while (1, ) for tensor with 1-channel. Defaults to None. to_rgb (bool, optional): Whether the tensor was converted to RGB format in the first place. If so, convert it back to BGR. For the tensor with 1 channel, it must be False. Defaults to True. Returns: list[np.ndarray]: A list that contains multiple images. """ if torch is None: raise RuntimeError('pytorch is not installed') assert torch.is_tensor(tensor) and tensor.ndim == 4 channels = tensor.size(1) assert channels in [1, 3] if mean is None: mean = (0, ) * channels if std is None: std = (1, ) * channels assert (channels == len(mean) == len(std) == 3) or \ (channels == len(mean) == len(std) == 1 and not to_rgb) num_imgs = tensor.size(0) mean = np.array(mean, dtype=np.float32) std = np.array(std, dtype=np.float32) imgs = [] for img_id in range(num_imgs): img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) img = mmcv.imdenormalize( img, mean, std, to_bgr=to_rgb).astype(np.uint8) imgs.append(np.ascontiguousarray(img)) return imgs ================================================ FILE: mmcv/image/photometric.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from typing import Optional import cv2 import numpy as np from mmengine.utils import is_tuple_of from PIL import Image, ImageEnhance from .colorspace import bgr2gray, gray2bgr from .io import imread_backend def imnormalize(img, mean, std, to_rgb=True): """Normalize an image with mean and std. Args: img (ndarray): Image to be normalized. mean (ndarray): The mean to be used for normalize. std (ndarray): The std to be used for normalize. to_rgb (bool): Whether to convert to rgb. Returns: ndarray: The normalized image. 
""" img = img.copy().astype(np.float32) return imnormalize_(img, mean, std, to_rgb) def imnormalize_(img, mean, std, to_rgb=True): """Inplace normalize an image with mean and std. Args: img (ndarray): Image to be normalized. mean (ndarray): The mean to be used for normalize. std (ndarray): The std to be used for normalize. to_rgb (bool): Whether to convert to rgb. Returns: ndarray: The normalized image. """ # cv2 inplace normalization does not accept uint8 assert img.dtype != np.uint8 mean = np.float64(mean.reshape(1, -1)) stdinv = 1 / np.float64(std.reshape(1, -1)) if to_rgb: cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace cv2.subtract(img, mean, img) # inplace cv2.multiply(img, stdinv, img) # inplace return img def imdenormalize(img, mean, std, to_bgr=True): assert img.dtype != np.uint8 mean = mean.reshape(1, -1).astype(np.float64) std = std.reshape(1, -1).astype(np.float64) img = cv2.multiply(img, std) # make a copy cv2.add(img, mean, img) # inplace if to_bgr: cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace return img def iminvert(img): """Invert (negate) an image. Args: img (ndarray): Image to be inverted. Returns: ndarray: The inverted image. """ return np.full_like(img, 255) - img def solarize(img, thr=128): """Solarize an image (invert all pixel values above a threshold) Args: img (ndarray): Image to be solarized. thr (int): Threshold for solarizing (0 - 255). Returns: ndarray: The solarized image. """ img = np.where(img < thr, img, 255 - img) return img def posterize(img, bits): """Posterize an image (reduce the number of bits for each color channel) Args: img (ndarray): Image to be posterized. bits (int): Number of bits (1 to 8) to use for posterizing. Returns: ndarray: The posterized image. """ shift = 8 - bits img = np.left_shift(np.right_shift(img, shift), shift) return img def adjust_color(img, alpha=1, beta=None, gamma=0, backend=None): r"""It blends the source image and its gray image: .. 
math:: output = img * alpha + gray\_img * beta + gamma Args: img (ndarray): The input source image. alpha (int | float): Weight for the source image. Default 1. beta (int | float): Weight for the converted gray image. If None, it's assigned the value (1 - `alpha`). gamma (int | float): Scalar added to each sum. Same as :func:`cv2.addWeighted`. Default 0. backend (str | None): The image processing backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global ``imread_backend`` specified by ``mmcv.use_backend()`` will be used. Defaults to None. Returns: ndarray: Colored image which has the same size and dtype as input. """ if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported.' f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' warnings.warn("Only use 'alpha' for pillow backend.") # Image.fromarray defaultly supports RGB, not BGR. pil_image = Image.fromarray(img[..., ::-1], mode='RGB') enhancer = ImageEnhance.Color(pil_image) pil_image = enhancer.enhance(alpha) return np.array(pil_image, dtype=img.dtype)[..., ::-1] else: gray_img = bgr2gray(img) gray_img = np.tile(gray_img[..., None], [1, 1, 3]) if beta is None: beta = 1 - alpha colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) if not colored_img.dtype == np.uint8: # Note when the dtype of `img` is not the default `np.uint8` # (e.g. np.float32), the value in `colored_img` got from cv2 # is not guaranteed to be in range [0, 255], so here clip # is needed. colored_img = np.clip(colored_img, 0, 255) return colored_img.astype(img.dtype) def imequalize(img): """Equalize the image histogram. This function applies a non-linear mapping to the input image, in order to create a uniform distribution of grayscale values in the output image. Args: img (ndarray): Image to be equalized. Returns: ndarray: The equalized image. 
""" def _scale_channel(im, c): """Scale the data in the corresponding channel.""" im = im[:, :, c] # Compute the histogram of the image channel. histo = np.histogram(im, 256, (0, 255))[0] # For computing the step, filter out the nonzeros. nonzero_histo = histo[histo > 0] step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 if not step: lut = np.array(range(256)) else: # Compute the cumulative sum, shifted by step // 2 # and then normalized by step. lut = (np.cumsum(histo) + (step // 2)) // step # Shift lut, prepending with 0. lut = np.concatenate([[0], lut[:-1]], 0) # handle potential integer overflow lut[lut > 255] = 255 # If step is zero, return the original image. # Otherwise, index from lut. return np.where(np.equal(step, 0), im, lut[im]) # Scales each channel independently and then stacks # the result. s1 = _scale_channel(img, 0) s2 = _scale_channel(img, 1) s3 = _scale_channel(img, 2) equalized_img = np.stack([s1, s2, s3], axis=-1) return equalized_img.astype(img.dtype) def adjust_brightness(img, factor=1., backend=None): """Adjust image brightness. This function controls the brightness of an image. An enhancement factor of 0.0 gives a black image. A factor of 1.0 gives the original image. This function blends the source image and the degenerated black image: .. math:: output = img * factor + degenerated * (1 - factor) Args: img (ndarray): Image to be brightened. factor (float): A value controls the enhancement. Factor 1.0 returns the original image, lower factors mean less color (brightness, contrast, etc), and higher values more. Default 1. backend (str | None): The image processing backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global ``imread_backend`` specified by ``mmcv.use_backend()`` will be used. Defaults to None. Returns: ndarray: The brightened image. """ if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported.' 
f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' # Image.fromarray defaultly supports RGB, not BGR. pil_image = Image.fromarray(img[..., ::-1], mode='RGB') enhancer = ImageEnhance.Brightness(pil_image) pil_image = enhancer.enhance(factor) return np.array(pil_image, dtype=img.dtype)[..., ::-1] else: degenerated = np.zeros_like(img) # Note manually convert the dtype to np.float32, to # achieve as close results as PIL.ImageEnhance.Brightness. # Set beta=1-factor, and gamma=0 brightened_img = cv2.addWeighted( img.astype(np.float32), factor, degenerated.astype(np.float32), 1 - factor, 0) brightened_img = np.clip(brightened_img, 0, 255) return brightened_img.astype(img.dtype) def adjust_contrast(img, factor=1., backend=None): """Adjust image contrast. This function controls the contrast of an image. An enhancement factor of 0.0 gives a solid grey image. A factor of 1.0 gives the original image. It blends the source image and the degenerated mean image: .. math:: output = img * factor + degenerated * (1 - factor) Args: img (ndarray): Image to be contrasted. BGR order. factor (float): Same as :func:`mmcv.adjust_brightness`. backend (str | None): The image processing backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global ``imread_backend`` specified by ``mmcv.use_backend()`` will be used. Defaults to None. Returns: ndarray: The contrasted image. """ if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported.' f"Supported backends are 'cv2', 'pillow'") if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' # Image.fromarray defaultly supports RGB, not BGR. 
pil_image = Image.fromarray(img[..., ::-1], mode='RGB') enhancer = ImageEnhance.Contrast(pil_image) pil_image = enhancer.enhance(factor) return np.array(pil_image, dtype=img.dtype)[..., ::-1] else: gray_img = bgr2gray(img) hist = np.histogram(gray_img, 256, (0, 255))[0] mean = round(np.sum(gray_img) / np.sum(hist)) degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) degenerated = gray2bgr(degenerated) contrasted_img = cv2.addWeighted( img.astype(np.float32), factor, degenerated.astype(np.float32), 1 - factor, 0) contrasted_img = np.clip(contrasted_img, 0, 255) return contrasted_img.astype(img.dtype) def auto_contrast(img, cutoff=0): """Auto adjust image contrast. This function maximize (normalize) image contrast by first removing cutoff percent of the lightest and darkest pixels from the histogram and remapping the image so that the darkest pixel becomes black (0), and the lightest becomes white (255). Args: img (ndarray): Image to be contrasted. BGR order. cutoff (int | float | tuple): The cutoff percent of the lightest and darkest pixels to be removed. If given as tuple, it shall be (low, high). Otherwise, the single value will be used for both. Defaults to 0. Returns: ndarray: The contrasted image. """ def _auto_contrast_channel(im, c, cutoff): im = im[:, :, c] # Compute the histogram of the image channel. 
histo = np.histogram(im, 256, (0, 255))[0] # Remove cut-off percent pixels from histo histo_sum = np.cumsum(histo) cut_low = histo_sum[-1] * cutoff[0] // 100 cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) # Compute mapping low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] # If all the values have been cut off, return the origin img if low >= high: return im scale = 255.0 / (high - low) offset = -low * scale lut = np.array(range(256)) lut = lut * scale + offset lut = np.clip(lut, 0, 255) return lut[im] if isinstance(cutoff, (int, float)): cutoff = (cutoff, cutoff) else: assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ f'float or tuple, but got {type(cutoff)} instead.' # Auto adjusts contrast for each channel independently and then stacks # the result. s1 = _auto_contrast_channel(img, 0, cutoff) s2 = _auto_contrast_channel(img, 1, cutoff) s3 = _auto_contrast_channel(img, 2, cutoff) contrasted_img = np.stack([s1, s2, s3], axis=-1) return contrasted_img.astype(img.dtype) def adjust_sharpness(img, factor=1., kernel=None): """Adjust image sharpness. This function controls the sharpness of an image. An enhancement factor of 0.0 gives a blurred image. A factor of 1.0 gives the original image. And a factor of 2.0 gives a sharpened image. It blends the source image and the degenerated mean image: .. math:: output = img * factor + degenerated * (1 - factor) Args: img (ndarray): Image to be sharpened. BGR order. factor (float): Same as :func:`mmcv.adjust_brightness`. kernel (np.ndarray, optional): Filter kernel to be applied on the img to obtain the degenerated img. Defaults to None. Note: No value sanity check is enforced on the kernel set by users. 
So with an inappropriate kernel, the ``adjust_sharpness`` may fail to perform the function its name indicates but end up performing whatever transform determined by the kernel. Returns: ndarray: The sharpened image. """ if kernel is None: # adopted from PIL.ImageFilter.SMOOTH kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 assert isinstance(kernel, np.ndarray), \ f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' assert kernel.ndim == 2, \ f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' degenerated = cv2.filter2D(img, -1, kernel) sharpened_img = cv2.addWeighted( img.astype(np.float32), factor, degenerated.astype(np.float32), 1 - factor, 0) sharpened_img = np.clip(sharpened_img, 0, 255) return sharpened_img.astype(img.dtype) def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): """AlexNet-style PCA jitter. This data augmentation is proposed in `ImageNet Classification with Deep Convolutional Neural Networks `_. Args: img (ndarray): Image to be adjusted lighting. BGR order. eigval (ndarray): the eigenvalue of the convariance matrix of pixel values, respectively. eigvec (ndarray): the eigenvector of the convariance matrix of pixel values, respectively. alphastd (float): The standard deviation for distribution of alpha. Defaults to 0.1 to_rgb (bool): Whether to convert img to rgb. Returns: ndarray: The adjusted image. """ assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ f'eigval and eigvec should both be of type np.ndarray, got ' \ f'{type(eigval)} and {type(eigvec)} instead.' assert eigval.ndim == 1 and eigvec.ndim == 2 assert eigvec.shape == (3, eigval.shape[0]) n_eigval = eigval.shape[0] assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ f'got {type(alphastd)} instead.' 
img = img.copy().astype(np.float32) if to_rgb: cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace alpha = np.random.normal(0, alphastd, n_eigval) alter = eigvec \ * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) img_adjusted = img + alter return img_adjusted def lut_transform(img, lut_table): """Transform array by look-up table. The function lut_transform fills the output array with values from the look-up table. Indices of the entries are taken from the input array. Args: img (ndarray): Image to be transformed. lut_table (ndarray): look-up table of 256 elements; in case of multi-channel input array, the table should either have a single channel (in this case the same table is used for all channels) or the same number of channels as in the input array. Returns: ndarray: The transformed image. """ assert isinstance(img, np.ndarray) assert 0 <= np.min(img) and np.max(img) <= 255 assert isinstance(lut_table, np.ndarray) assert lut_table.shape == (256, ) return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): """Use CLAHE method to process the image. See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. Graphics Gems, 1994:474-485.` for more information. Args: img (ndarray): Image to be processed. clip_limit (float): Threshold for contrast limiting. Default: 40.0. tile_grid_size (tuple[int]): Size of grid for histogram equalization. Input image will be divided into equally sized rectangular tiles. It defines the number of tiles in row and column. Default: (8, 8). Returns: ndarray: The processed image. 
""" assert isinstance(img, np.ndarray) assert img.ndim == 2 assert isinstance(clip_limit, (float, int)) assert is_tuple_of(tile_grid_size, int) assert len(tile_grid_size) == 2 clahe = cv2.createCLAHE(clip_limit, tile_grid_size) return clahe.apply(np.array(img, dtype=np.uint8)) def adjust_hue(img: np.ndarray, hue_factor: float, backend: Optional[str] = None) -> np.ndarray: """Adjust hue of an image. The image hue is adjusted by converting the image to HSV and cyclically shifting the intensities in the hue channel (H). The image is then converted back to original image mode. `hue_factor` is the amount of shift in H channel and must be in the interval `[-0.5, 0.5]`. Modified from https://github.com/pytorch/vision/blob/main/torchvision/ transforms/functional.py Args: img (ndarray): Image to be adjusted. hue_factor (float): How much to shift the hue channel. Should be in [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in HSV space in positive and negative direction respectively. 0 means no shift. Therefore, both -0.5 and 0.5 will give an image with complementary colors while 0 gives the original image. backend (str | None): The image processing backend type. Options are `cv2`, `pillow`, `None`. If backend is None, the global ``imread_backend`` specified by ``mmcv.use_backend()`` will be used. Defaults to None. Returns: ndarray: Hue adjusted image. """ if backend is None: backend = imread_backend if backend not in ['cv2', 'pillow']: raise ValueError(f'backend: {backend} is not supported.' f"Supported backends are 'cv2', 'pillow'") if not (-0.5 <= hue_factor <= 0.5): raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].') if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})): raise TypeError('img should be ndarray with dim=[2 or 3].') if backend == 'pillow': assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' # Image.fromarray defaultly supports RGB, not BGR. 
pil_image = Image.fromarray(img[..., ::-1], mode='RGB') input_mode = pil_image.mode if input_mode in {'L', '1', 'I', 'F'}: return pil_image h, s, v = pil_image.convert('HSV').split() np_h = np.array(h, dtype=np.uint8) # uint8 addition take cares of rotation across boundaries with np.errstate(over='ignore'): np_h += np.uint8(hue_factor * 255) h = Image.fromarray(np_h, 'L') pil_image = Image.merge('HSV', (h, s, v)).convert(input_mode) return np.array(pil_image, dtype=img.dtype)[..., ::-1] else: dtype = img.dtype img = img.astype(np.uint8) hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) h, s, v = cv2.split(hsv_img) h = h.astype(np.uint8) # uint8 addition take cares of rotation across boundaries with np.errstate(over='ignore'): h += np.uint8(hue_factor * 255) hsv_img = cv2.merge([h, s, v]) return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) ================================================ FILE: mmcv/ops/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from mmcv.utils import IS_MLU_AVAILABLE from .active_rotated_filter import active_rotated_filter from .assign_score_withk import assign_score_withk from .ball_query import ball_query from .bbox import bbox_overlaps from .bezier_align import BezierAlign, bezier_align from .bias_act import bias_act from .border_align import BorderAlign, border_align from .box_iou_quadri import box_iou_quadri from .box_iou_rotated import box_iou_rotated from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive from .cc_attention import CrissCrossAttention from .chamfer_distance import chamfer_distance from .contour_expand import contour_expand from .conv2d_gradfix import conv2d, conv_transpose2d from .convex_iou import convex_giou, convex_iou from .corner_pool import CornerPool from .correlation import Correlation from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, ModulatedDeformRoIPoolPack, deform_roi_pool) from .deprecated_wrappers import Conv2d_deprecated as Conv2d from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d from .deprecated_wrappers import Linear_deprecated as Linear from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d from .filtered_lrelu import filtered_lrelu from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, sigmoid_focal_loss, softmax_focal_loss) from .furthest_point_sample import (furthest_point_sample, furthest_point_sample_with_dist) from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu from .gather_points import gather_points from .group_points import GroupAll, QueryAndGroup, grouping_operation from .info import get_compiler_version, get_compiling_cuda_version from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d, nms3d_normal, nms_bev, nms_normal_bev) from .knn import knn from .masked_conv import 
MaskedConv2d, masked_conv2d from .min_area_polygons import min_area_polygons from .modulated_deform_conv import (ModulatedDeformConv2d, ModulatedDeformConv2dPack, modulated_deform_conv2d) from .multi_scale_deform_attn import MultiScaleDeformableAttention from .nms import batched_nms, nms, nms_match, nms_quadri, nms_rotated, soft_nms from .pixel_group import pixel_group from .point_sample import (SimpleRoIAlign, point_sample, rel_roi_point_to_rel_img_point) from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, points_in_boxes_part) from .points_in_polygons import points_in_polygons from .points_sampler import PointsSampler from .prroi_pool import PrRoIPool, prroi_pool from .psa_mask import PSAMask from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated from .roi_align import RoIAlign, roi_align from .roi_align_rotated import RoIAlignRotated, roi_align_rotated from .roi_pool import RoIPool, roi_pool from .roiaware_pool3d import RoIAwarePool3d from .roipoint_pool3d import RoIPointPool3d from .rotated_feature_align import rotated_feature_align from .saconv import SAConv2d from .scatter_points import DynamicScatter, dynamic_scatter from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, SparseConvTranspose3d, SparseInverseConv2d, SparseInverseConv3d, SubMConv2d, SubMConv3d) from .sparse_modules import SparseModule, SparseSequential from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d from .sparse_structure import SparseConvTensor, scatter_nd from .sync_bn import SyncBatchNorm from .three_interpolate import three_interpolate from .three_nn import three_nn from .tin_shift import TINShift, tin_shift from .upfirdn2d import filter2d, upfirdn2d, upsample2d from .voxelize import Voxelization, voxelization __all__ = [ 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', 
'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', 'get_compiler_version', 'get_compiling_cuda_version', 'MaskedConv2d', 'masked_conv2d', 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', 'box_iou_rotated', 'box_iou_quadri', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', 'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', 'gather_points', 'furthest_point_sample', 'nms_quadri', 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', 'PrRoIPool', 'prroi_pool', 'bias_act', 'filtered_lrelu', 'conv2d', 'conv_transpose2d', 'filter2d', 'upsample2d', 
'BezierAlign', 'bezier_align' ] if IS_MLU_AVAILABLE: from .deform_conv import DeformConv2dPack_MLU # noqa:F401 from .modulated_deform_conv import \ ModulatedDeformConv2dPack_MLU # noqa:F401 __all__.extend(['ModulatedDeformConv2dPack_MLU', 'DeformConv2dPack_MLU']) ================================================ FILE: mmcv/ops/active_rotated_filter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple import torch from torch.autograd import Function from torch.autograd.function import once_differentiable from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['active_rotated_filter_forward', 'active_rotated_filter_backward']) class ActiveRotatedFilterFunction(Function): """Encoding the orientation information and generating orientation- sensitive features. The details are described in the paper `Align Deep Features for Oriented Object Detection _`. """ # noqa: E501 @staticmethod def forward(ctx, input: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: """ Args: input (torch.Tensor): Input features with shape [num_output_planes, num_input_planes, num_orientations, H, W]. indices (torch.Tensor): Indices with shape [num_orientations, H, W, num_rotations]. Returns: torch.Tensor: Refined features with shape [num_output_planes * num_rotations, num_input_planes * num_orientations, H, W]. """ ctx.save_for_backward(input, indices) op, ip, o, h, w = input.size() o, h, w, r = indices.size() output = input.new_zeros((op * r, ip * o, h, w)) ext_module.active_rotated_filter_forward(input, indices, output) return output @staticmethod @once_differentiable def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]: """ Args: grad_output (torch.Tensor): The gradient of output features with shape [num_output_planes * num_rotations, num_input_planes * num_orientations, H, W]. 
Returns: torch.Tensor: The gradient of input features with shape [num_output_planes, num_input_planes, num_orientations, H, W]. """ input, indices = ctx.saved_tensors grad_in = torch.zeros_like(input) ext_module.active_rotated_filter_backward(grad_out, indices, grad_in) return grad_in, None active_rotated_filter = ActiveRotatedFilterFunction.apply ================================================ FILE: mmcv/ops/assign_score_withk.py ================================================ from typing import Tuple import torch from torch.autograd import Function from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward']) class AssignScoreWithK(Function): r"""Perform weighted sum to generate output features according to scores. Modified from `PAConv `_. This is a memory-efficient CUDA implementation of assign_scores operation, which first transform all point features with weight bank, then assemble neighbor features with ``knn_idx`` and perform weighted sum of ``scores``. See the `paper `_ appendix Sec. D for more detailed descriptions. Note: This implementation assumes using ``neighbor`` kernel input, which is (point_features - center_features, point_features). See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/ pointnet2/paconv.py#L128 for more details. """ @staticmethod def forward(ctx, scores: torch.Tensor, point_features: torch.Tensor, center_features: torch.Tensor, knn_idx: torch.Tensor, aggregate: str = 'sum') -> torch.Tensor: """ Args: scores (torch.Tensor): (B, npoint, K, M), predicted scores to aggregate weight matrices in the weight bank. ``npoint`` is the number of sampled centers. ``K`` is the number of queried neighbors. ``M`` is the number of weight matrices in the weight bank. point_features (torch.Tensor): (B, N, M, out_dim) Pre-computed point features to be aggregated. center_features (torch.Tensor): (B, N, M, out_dim) Pre-computed center features to be aggregated. 
knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
                We assume the first idx in each row is the idx of the center.
            aggregate (str, optional): Aggregation method.
                Can be 'sum', 'avg' or 'max'. Defaults: 'sum'.

        Returns:
            torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
        """
        # Integer codes handed to the extension op; an unknown ``aggregate``
        # name raises KeyError at the lookup below.
        agg = {'sum': 0, 'avg': 1, 'max': 2}

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        # Output buffer, passed to the extension op together with the inputs.
        output = point_features.new_zeros((B, out_dim, npoint, K))
        ext_module.assign_score_withk_forward(
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            output,
            B=B,
            N0=N,
            N1=npoint,
            M=M,
            K=K,
            O=out_dim,
            aggregate=agg[aggregate])

        ctx.save_for_backward(output, point_features, center_features, scores,
                              knn_idx)
        # Keep the integer aggregation code for the backward pass.
        ctx.agg = agg[aggregate]

        return output

    @staticmethod
    def backward(
        ctx, grad_out: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:
        """
        Args:
            grad_out (torch.Tensor): (B, out_dim, npoint, K)

        Returns:
            tuple[torch.Tensor]: A tuple contains five elements. The first one
            is the gradient of ``scores`` whose shape is (B, npoint, K, M). The
            second is the gradient of ``point_features`` whose shape is
            (B, N, M, out_dim). The third is the gradient of
            ``center_features`` with the shape of (B, N, M, out_dim). The last
            two are ``None``.
        """
        # The saved forward output is not needed here.
        _, point_features, center_features, scores, knn_idx = ctx.saved_tensors

        agg = ctx.agg

        B, N, M, out_dim = point_features.size()
        _, npoint, K, _ = scores.size()

        # Zero-initialised gradient buffers passed to the extension op.
        grad_point_features = point_features.new_zeros(point_features.shape)
        grad_center_features = center_features.new_zeros(center_features.shape)
        grad_scores = scores.new_zeros(scores.shape)

        ext_module.assign_score_withk_backward(
            grad_out.contiguous(),
            point_features.contiguous(),
            center_features.contiguous(),
            scores.contiguous(),
            knn_idx.contiguous(),
            grad_point_features,
            grad_center_features,
            grad_scores,
            B=B,
            N0=N,
            N1=npoint,
            M=M,
            K=K,
            O=out_dim,
            aggregate=agg)

        # One entry per forward argument; ``knn_idx`` and ``aggregate`` get
        # no gradient.
        return grad_scores, grad_point_features, \
            grad_center_features, None, None


assign_score_withk = AssignScoreWithK.apply


================================================
FILE: mmcv/ops/ball_query.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Tuple

import torch
from torch.autograd import Function

from ..utils import ext_loader

ext_module = ext_loader.load_ext(
    '_ext', ['ball_query_forward', 'stack_ball_query_forward'])


class BallQuery(Function):
    """Find nearby points in spherical space."""

    @staticmethod
    def forward(
            ctx,
            min_radius: float,
            max_radius: float,
            sample_num: int,
            xyz: torch.Tensor,
            center_xyz: torch.Tensor,
            xyz_batch_cnt: Optional[torch.Tensor] = None,
            center_xyz_batch_cnt: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Args:
            min_radius (float): minimum radius of the balls.
            max_radius (float): maximum radius of the balls.
            sample_num (int): maximum number of features in the balls.
            xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features,
                or stacked input (N1 + N2 ..., 3).
            center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
                query, or stacked input (M1 + M2 ..., 3).
            xyz_batch_cnt: (batch_size): Stacked input xyz coordinates nums in
                each batch, just like (N1, N2, ...). Defaults to None.
                New in version 1.7.0.
            center_xyz_batch_cnt: (batch_size): Stacked centers coordinates
                nums in each batch, just like (M1, M2, ...). Defaults to None.
                New in version 1.7.0.

        Returns:
            torch.Tensor: (B, npoint, nsample) tensor with the indices of the
            features that form the query balls. In the stacked mode the
            result has shape (M1 + M2 ..., nsample).
        """
        assert center_xyz.is_contiguous()
        assert xyz.is_contiguous()
        assert min_radius < max_radius
        # Stacked mode is used only when both batch-count tensors are given.
        if xyz_batch_cnt is not None and center_xyz_batch_cnt is not None:
            assert xyz_batch_cnt.dtype == torch.int
            assert center_xyz_batch_cnt.dtype == torch.int
            idx = center_xyz.new_zeros((center_xyz.shape[0], sample_num),
                                       dtype=torch.int32)
            # NOTE(review): ``min_radius`` is not forwarded to the stacked
            # op - confirm this is intended.
            ext_module.stack_ball_query_forward(
                center_xyz,
                center_xyz_batch_cnt,
                xyz,
                xyz_batch_cnt,
                idx,
                max_radius=max_radius,
                nsample=sample_num,
            )
        else:
            B, N, _ = xyz.size()
            npoint = center_xyz.size(1)
            idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int32)
            ext_module.ball_query_forward(
                center_xyz,
                xyz,
                idx,
                b=B,
                n=N,
                m=npoint,
                min_radius=min_radius,
                max_radius=max_radius,
                nsample=sample_num)
        # The index tensor carries no gradient; skipped under parrots.
        if torch.__version__ != 'parrots':
            ctx.mark_non_differentiable(idx)
        return idx

    @staticmethod
    def backward(ctx, a=None) -> Tuple[None, None, None, None]:
        # NOTE(review): returns four ``None`` values although ``forward``
        # takes seven arguments after ``ctx`` - confirm whether this path
        # can ever be reached.
        return None, None, None, None


ball_query = BallQuery.apply


================================================
FILE: mmcv/ops/bbox.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch

from ..utils import ext_loader

ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])


def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
                       bboxes2: torch.Tensor,
                       mode: str = 'iou',
                       aligned: bool = False,
                       offset: int = 0) -> torch.Tensor:
    """Pure-PyTorch overlap computation between two sets of boxes.

    Args:
        bboxes1 (torch.Tensor): Boxes whose columns are read as
            (left, top, right, bottom).
        bboxes2 (torch.Tensor): Boxes in the same layout as ``bboxes1``.
        mode (str): 'iou' divides the intersection by the union, 'iof'
            divides it by the area of the ``bboxes1`` box.
        aligned (bool): If True, compare the boxes row by row; otherwise
            compare every box of ``bboxes1`` with every box of ``bboxes2``.
        offset (int): Value added to every width and height.

    Returns:
        torch.Tensor: One value per row if ``aligned`` else a
        (rows, cols) matrix.
    """
    assert mode in ['iou', 'iof']

    if aligned:
        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]

        # Non-overlapping pairs produce negative extents; clamp them to 0.
        wh = (rb - lt + offset).clamp(min=0)  # [rows, 2]
        overlap = wh[:, 0] * wh[:, 1]
        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
            bboxes1[:, 3] - bboxes1[:, 1] + offset)

        if mode == 'iou':
            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
                bboxes2[:, 3] - bboxes2[:, 1] + offset)
            ious = overlap / (area1 + area2 - overlap)
        else:
            ious = overlap / area1
    else:
        # Broadcast bboxes1 over a new axis to form all pairs.
        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]

        wh = (rb - lt + offset).clamp(min=0)  # [rows, cols, 2]
        overlap = wh[:, :, 0] * wh[:, :, 1]
        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
            bboxes1[:, 3] - bboxes1[:, 1] + offset)

        if mode == 'iou':
            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
                bboxes2[:, 3] - bboxes2[:, 1] + offset)
            ious = overlap / (area1[:, None] + area2 - overlap)
        else:
            ious = overlap / (area1[:, None])

    return ious


def bbox_overlaps(bboxes1: torch.Tensor,
                  bboxes2: torch.Tensor,
                  mode: str = 'iou',
                  aligned: bool = False,
                  offset: int = 0) -> torch.Tensor:
    """Calculate overlap between two set of bboxes.

    If ``aligned`` is ``False``, then calculate the ious between each bbox
    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
    bboxes1 and bboxes2.

    Args:
        bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
            empty.
        bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
            empty. If aligned is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union) or iof (intersection over
            foreground).
        aligned (bool): Whether to compare the boxes pair by pair instead of
            all against all. Defaults to False.
        offset (int): Must be 0 or 1. It is passed to the overlap
            computation, where the CPU fallback adds it to every box width
            and height. Defaults to 0.

    Returns:
        torch.Tensor: Return the ious betweens boxes. If ``aligned`` is
        ``False``, the shape of ious is (m, n) else (m,).

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> bbox_overlaps(bboxes1, bboxes2)
        tensor([[0.5000, 0.0000, 0.0000],
                [0.0000, 0.0000, 1.0000],
                [0.0000, 0.0000, 0.0000]])

    Example:
        >>> empty = torch.FloatTensor([])
        >>> nonempty = torch.FloatTensor([
        >>>     [0, 0, 10, 9],
        >>> ])
        >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
    """

    # Integer mode flag expected by the extension op.
    mode_dict = {'iou': 0, 'iof': 1}
    assert mode in mode_dict.keys()
    mode_flag = mode_dict[mode]
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
    assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
    assert offset == 1 or offset == 0

    rows = bboxes1.size(0)
    cols = bboxes2.size(0)

    if aligned:
        assert rows == cols
        ious = bboxes1.new_zeros(rows)
    else:
        ious = bboxes1.new_zeros((rows, cols))

    # Nothing to compute when either set is empty.
    if rows * cols == 0:
        return ious

    # Under parrots, CPU tensors use the pure-PyTorch implementation.
    if bboxes1.device.type == 'cpu' and torch.__version__ == 'parrots':
        return _bbox_overlaps_cpu(
            bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)

    # ``ious`` is passed to the op as the output buffer.
    ext_module.bbox_overlaps(
        bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset)

    return ious


================================================
FILE: mmcv/ops/bezier_align.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple, Union import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['bezier_align_forward', 'bezier_align_backward']) class BezierAlignFunction(Function): @staticmethod def forward(ctx, input: torch.Tensor, beziers: torch.Tensor, output_size: Union[int, Tuple[int, int]], spatial_scale: Union[int, float] = 1.0, sampling_ratio: int = 0, aligned: bool = True) -> torch.Tensor: ctx.output_size = _pair(output_size) ctx.spatial_scale = spatial_scale ctx.input_shape = input.size() ctx.sampling_ratio = sampling_ratio ctx.aligned = aligned assert beziers.size(1) == 17 output_shape = (beziers.size(0), input.size(1), ctx.output_size[0], ctx.output_size[1]) output = input.new_zeros(output_shape) ext_module.bezier_align_forward( input, beziers, output, aligned_height=ctx.output_size[0], aligned_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, aligned=ctx.aligned) ctx.save_for_backward(beziers) return output @staticmethod @once_differentiable def backward(ctx, grad_output: torch.Tensor): beziers = ctx.saved_tensors[0] grad_input = grad_output.new_zeros(ctx.input_shape) grad_output = grad_output.contiguous() ext_module.bezier_align_backward( grad_output, beziers, grad_input, aligned_height=ctx.output_size[0], aligned_width=ctx.output_size[1], spatial_scale=ctx.spatial_scale, sampling_ratio=ctx.sampling_ratio, aligned=ctx.aligned) return grad_input, None, None, None, None, None bezier_align = BezierAlignFunction.apply class BezierAlign(nn.Module): """Bezier align pooling layer. Args: output_size (tuple): h, w spatial_scale (float): scale the input boxes by this number sampling_ratio (int): number of inputs samples to take for each output sample. 0 to take samples densely for current models. 
aligned (bool): if False, use the legacy implementation in MMDetection. If True, align the results more perfectly. Note: The implementation of BezierAlign is modified from https://github.com/aim-uofa/AdelaiDet The meaning of aligned=True: Given a continuous coordinate c, its two neighboring pixel indices (in our pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled from the underlying signal at continuous coordinates 0.5 and 1.5). But the original roi_align (aligned=False) does not subtract the 0.5 when computing neighboring pixel indices and therefore it uses pixels with a slightly incorrect alignment (relative to our pixel model) when performing bilinear interpolation. With `aligned=True`, we first appropriately scale the ROI and then shift it by -0.5 prior to calling roi_align. This produces the correct neighbors; The difference does not make a difference to the model's performance if ROIAlign is used together with conv layers. """ def __init__( self, output_size: Tuple, spatial_scale: Union[int, float], sampling_ratio: int, aligned: bool = True, ) -> None: super().__init__() self.output_size = _pair(output_size) self.spatial_scale = float(spatial_scale) self.sampling_ratio = int(sampling_ratio) self.aligned = aligned def forward(self, input: torch.Tensor, beziers: torch.Tensor) -> torch.Tensor: """BezierAlign forward. Args: inputs (Tensor): input features. beziers (Tensor): beziers for align. 
""" return bezier_align(input, beziers, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned) def __repr__(self): s = self.__class__.__name__ s += f'(output_size={self.output_size}, ' s += f'spatial_scale={self.spatial_scale})' s += f'sampling_ratio={self.sampling_ratio})' s += f'aligned={self.aligned})' return s ================================================ FILE: mmcv/ops/bias_act.py ================================================ # Modified from # https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/bias_act.py # Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # NVIDIA CORPORATION and its licensors retain all intellectual property # and proprietary rights in and to this software, related documentation # and any modifications thereto. Any use, reproduction, disclosure or # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. # source: https://github.com/open-mmlab/mmediting/blob/dev-1.x/mmedit/models/editors/stylegan3/stylegan3_ops/ops/bias_act.py # noqa """Custom PyTorch ops for efficient bias and activation.""" from typing import Any, Dict, Optional, Union import numpy as np import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['bias_act']) class EasyDict(dict): """Convenience class that behaves like a dict but allows access with the attribute syntax.""" def __getattr__(self, name: str) -> Any: try: return self[name] except KeyError: raise AttributeError(name) def __setattr__(self, name: str, value: Any) -> None: self[name] = value def __delattr__(self, name: str) -> None: del self[name] activation_funcs = { 'linear': EasyDict( func=lambda x, **_: x, def_alpha=0, def_gain=1, cuda_idx=1, ref='', has_2nd_grad=False), 'relu': EasyDict( func=lambda x, **_: torch.nn.functional.relu(x), def_alpha=0, def_gain=np.sqrt(2), cuda_idx=2, ref='y', has_2nd_grad=False), 'lrelu': EasyDict( 
func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), def_alpha=0.2, def_gain=np.sqrt(2), cuda_idx=3, ref='y', has_2nd_grad=False), 'tanh': EasyDict( func=lambda x, **_: torch.tanh(x), def_alpha=0, def_gain=1, cuda_idx=4, ref='y', has_2nd_grad=True), 'sigmoid': EasyDict( func=lambda x, **_: torch.sigmoid(x), def_alpha=0, def_gain=1, cuda_idx=5, ref='y', has_2nd_grad=True), 'elu': EasyDict( func=lambda x, **_: torch.nn.functional.elu(x), def_alpha=0, def_gain=1, cuda_idx=6, ref='y', has_2nd_grad=True), 'selu': EasyDict( func=lambda x, **_: torch.nn.functional.selu(x), def_alpha=0, def_gain=1, cuda_idx=7, ref='y', has_2nd_grad=True), 'softplus': EasyDict( func=lambda x, **_: torch.nn.functional.softplus(x), def_alpha=0, def_gain=1, cuda_idx=8, ref='y', has_2nd_grad=True), 'swish': EasyDict( func=lambda x, **_: torch.sigmoid(x) * x, def_alpha=0, def_gain=np.sqrt(2), cuda_idx=9, ref='x', has_2nd_grad=True), } activation_funcs_musa = { 'linear': EasyDict( func=lambda x, **_: x, def_alpha=0, def_gain=1, musa_idx=1, ref='', has_2nd_grad=False), 'relu': EasyDict( func=lambda x, **_: torch.nn.functional.relu(x), def_alpha=0, def_gain=np.sqrt(2), musa_idx=2, ref='y', has_2nd_grad=False), 'lrelu': EasyDict( func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), def_alpha=0.2, def_gain=np.sqrt(2), musa_idx=3, ref='y', has_2nd_grad=False), 'tanh': EasyDict( func=lambda x, **_: torch.tanh(x), def_alpha=0, def_gain=1, musa_idx=4, ref='y', has_2nd_grad=True), 'sigmoid': EasyDict( func=lambda x, **_: torch.sigmoid(x), def_alpha=0, def_gain=1, musa_idx=5, ref='y', has_2nd_grad=True), 'elu': EasyDict( func=lambda x, **_: torch.nn.functional.elu(x), def_alpha=0, def_gain=1, musa_idx=6, ref='y', has_2nd_grad=True), 'selu': EasyDict( func=lambda x, **_: torch.nn.functional.selu(x), def_alpha=0, def_gain=1, musa_idx=7, ref='y', has_2nd_grad=True), 'softplus': EasyDict( func=lambda x, **_: torch.nn.functional.softplus(x), def_alpha=0, def_gain=1, musa_idx=8, 
ref='y', has_2nd_grad=True), 'swish': EasyDict( func=lambda x, **_: torch.sigmoid(x) * x, def_alpha=0, def_gain=np.sqrt(2), musa_idx=9, ref='x', has_2nd_grad=True), } _null_tensor = torch.empty([0]) def bias_act(input: torch.Tensor, bias: Optional[torch.Tensor] = None, dim: int = 1, act: str = 'linear', alpha: Optional[Union[float, int]] = None, gain: Optional[float] = None, clamp: Optional[float] = None, use_custom_op: bool = True): r"""Fused bias and activation function. Adds `bias` to activation tensor `input`, and evaluates activation function `act`, and scales the result by `gain`. Each of the steps is optional. In most cases, the fused op is considerably more efficient than performing the same calculation using standard PyTorch ops. It supports first and second order gradients, but not third order gradients. Args: input (torch.Tensor): Input activation tensor. Can be of any shape. bias (torch.Tensor): Bias vector, or `None` to disable. Must be a 1D tensor of the same type as `input`. The shape must be known, and it must match the dimension of `input` corresponding to `dim`. Defaults to None. dim (int): The dimension in `input` corresponding to the elements of `bias`. The value of `dim` is ignored if `b` is not specified. Defaults to 1. act (str): Name of the activation function to evaluate, or `"linear"` to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", "swish", etc. See `activation_funcs` for a full list. `None` is not allowed. Defaults to `linear`. alpha (float or int): Shape parameter for the activation function, or `None` to use the default. Defaults to None. gain (float): Scaling factor for the output tensor, or `None` to use default. See `activation_funcs` for the default scaling of each activation function. If unsure, consider specifying 1. Defaults to None. clamp (float): Clamp the output values to `[-clamp, +clamp]`, or `None` to disable the clamping (default). Defaults to None. use_custom_op (bool): Whether to use customized op. 
Defaults to True. Returns: torch.Tensor: Tensor of the same shape and datatype as `input`. """ assert isinstance(input, torch.Tensor) if use_custom_op and input.is_cuda: return _bias_act_cuda( dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(input, bias) try: if use_custom_op and input.is_musa: return _bias_act_musa( dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp).apply(input, bias) except AttributeError: pass return _bias_act_ref( input=input, bias=bias, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp) def _bias_act_ref(input: torch.Tensor, bias: Optional[torch.Tensor] = None, dim: int = 1, act: str = 'linear', alpha: Optional[Union[float, int]] = None, gain: Optional[float] = None, clamp: Optional[float] = None): """Slow reference implementation of `bias_act()` using standard PyTorch ops. Adds `bias` to activation tensor `input`, and evaluates activation function `act`, and scales the result by `gain`. Each of the steps is optional. In most cases, the fused op is considerably more efficient than performing the same calculation using standard PyTorch ops. It supports first and second order gradients, but not third order gradients. Args: input (torch.Tensor): Input activation tensor. Can be of any shape. bias (torch.Tensor): Bias vector, or `None` to disable. Must be a 1D tensor of the same type as `input`. The shape must be known, and it must match the dimension of `input` corresponding to `dim`. Defaults to None. dim (int): The dimension in `input` corresponding to the elements of `bias`. The value of `dim` is ignored if `b` is not specified. Defaults to 1. act (str): Name of the activation function to evaluate, or `"linear"` to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", "swish", etc. See `activation_funcs` for a full list. `None` is not allowed. Defaults to `linear`. alpha (float or int): Shape parameter for the activation function, or `None` to use the default. Defaults to None. 
gain (float): Scaling factor for the output tensor, or `None` to use default. See `activation_funcs` for the default scaling of each activation function. If unsure, consider specifying 1. Defaults to None. clamp (float): Clamp the output values to `[-clamp, +clamp]`, or `None` to disable the clamping (default). Defaults to None. Returns: torch.Tensor: Tensor of the same shape and datatype as `input`. """ assert isinstance(input, torch.Tensor) assert clamp is None or clamp >= 0 spec = activation_funcs[act] alpha = float(alpha if alpha is not None else spec.def_alpha) gain = float(gain if gain is not None else spec.def_gain) clamp = float(clamp if clamp is not None else -1) # Add bias. if bias is not None: assert isinstance(bias, torch.Tensor) and bias.ndim == 1 assert 0 <= dim < input.ndim assert bias.shape[0] == input.shape[dim] input = input + bias.reshape( [-1 if i == dim else 1 for i in range(input.ndim)]) # Evaluate activation function. alpha = float(alpha) output = spec.func(input, alpha=alpha) # Scale by gain. gain = float(gain) if gain != 1: output = output * gain # Clamp. if clamp >= 0: # pylint: disable=invalid-unary-operand-type output = output.clamp(-clamp, clamp) return output _bias_act_cuda_cache: Dict = dict() def _bias_act_cuda(dim: int = 1, act: str = 'linear', alpha: Optional[Union[float, int]] = None, gain: Optional[float] = None, clamp: Optional[float] = None): """"Fast CUDA implementation of `bias_act()` using custom ops. Args: dim (int): The dimension in `x` corresponding to the elements of `b`. The value of `dim` is ignored if `b` is not specified. Defaults to 1. act (str): Name of the activation function to evaluate, or `"linear"` to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid", "swish", etc. See `activation_funcs` for a full list. `None` is not allowed. Defaults to `linear`. alpha (float | int): Shape parameter for the activation function, or `None` to use the default. Defaults to None. 
gain (float): Scaling factor for the output tensor, or `None`
            to use default. See `activation_funcs` for the default scaling of
            each activation function. If unsure, consider specifying 1.
            Defaults to None.
        clamp (float): Clamp the output values to `[-clamp, +clamp]`,
            or `None` to disable the clamping (default). Defaults to None.

    Returns:
        type: The cached ``BiasActCuda`` autograd function class for these
        settings; calling its ``apply(x, b)`` yields a tensor of the same
        shape and datatype as `x`.
    """
    # Parse arguments.
    assert clamp is None or clamp >= 0
    spec = activation_funcs[act]
    alpha = float(alpha if alpha is not None else spec.def_alpha)
    gain = float(gain if gain is not None else spec.def_gain)
    # -1 is the sentinel for "no clamping".
    clamp = float(clamp if clamp is not None else -1)

    # Lookup from cache.
    key = (dim, act, alpha, gain, clamp)
    if key in _bias_act_cuda_cache:
        return _bias_act_cuda_cache[key]

    # Forward op.
    class BiasActCuda(torch.autograd.Function):

        @staticmethod
        def forward(ctx, x, b):  # pylint: disable=arguments-differ
            # Remember the layout so backward can match it.
            ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride(
                1) == 1 else torch.contiguous_format
            x = x.contiguous(memory_format=ctx.memory_format)
            b = b.contiguous() if b is not None else _null_tensor.to(x.device)
            y = x
            # NOTE(review): ``_null_tensor.to(x.device)`` is an identity
            # comparison against a freshly converted tensor; if ``.to``
            # returns a new object here the last clause is always true and
            # the op is always invoked - confirm.
            if act != 'linear' or gain != 1 or clamp >= 0 or (
                    b is not _null_tensor.to(x.device)):
                y = ext_module.bias_act(x, b, _null_tensor.to(x.device),
                                        _null_tensor.to(x.device),
                                        _null_tensor.to(x.device), 0, dim,
                                        spec.cuda_idx, alpha, gain, clamp)
            # Save only the tensors selected by ``spec.ref`` and
            # ``spec.has_2nd_grad``; the rest are replaced by the empty
            # placeholder.
            ctx.save_for_backward(
                x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to(
                    x.device), b if 'x' in spec.ref or spec.has_2nd_grad else
                _null_tensor.to(x.device),
                y if 'y' in spec.ref else _null_tensor.to(x.device))
            return y

        @staticmethod
        def backward(ctx, dy):  # pylint: disable=arguments-differ
            dy = dy.contiguous(memory_format=ctx.memory_format)
            x, b, y = ctx.saved_tensors
            dx = None
            db = None

            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
                dx = dy
                # Otherwise the incoming gradient is passed through as-is.
                if act != 'linear' or gain != 1 or clamp >= 0:
                    dx = BiasActCudaGrad.apply(dy, x, b, y)

            if ctx.needs_input_grad[1]:
                # Bias gradient: sum over every dimension except `dim`.
                db = dx.sum([i for i in range(dx.ndim) if i != dim])

            return dx, db

    # Backward op.
    class BiasActCudaGrad(torch.autograd.Function):

        @staticmethod
        def forward(ctx, dy, x, b, y):  # pylint: disable=arguments-differ
            ctx.memory_format = torch.channels_last if dy.ndim > 2 and (
                dy.stride(1) == 1) else torch.contiguous_format
            # Same op as the forward pass, called with 1 in place of 0.
            dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1,
                                     dim, spec.cuda_idx, alpha, gain, clamp)
            ctx.save_for_backward(
                dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b,
                y)
            return dx

        @staticmethod
        def backward(ctx, d_dx):  # pylint: disable=arguments-differ
            d_dx = d_dx.contiguous(memory_format=ctx.memory_format)
            dy, x, b, y = ctx.saved_tensors
            d_dy = None
            d_x = None
            d_b = None
            d_y = None

            if ctx.needs_input_grad[0]:
                d_dy = BiasActCudaGrad.apply(d_dx, x, b, y)

            # Computed only for activations flagged with has_2nd_grad.
            if spec.has_2nd_grad and (ctx.needs_input_grad[1]
                                      or ctx.needs_input_grad[2]):
                d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim,
                                          spec.cuda_idx, alpha, gain, clamp)

            if spec.has_2nd_grad and ctx.needs_input_grad[2]:
                d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim])

            return d_dy, d_x, d_b, d_y

    # Add to cache.
    _bias_act_cuda_cache[key] = BiasActCuda
    return BiasActCuda


# Generated autograd classes, keyed by (dim, act, alpha, gain, clamp).
_bias_act_musa_cache: Dict = dict()


def _bias_act_musa(dim: int = 1,
                   act: str = 'linear',
                   alpha: Optional[Union[float, int]] = None,
                   gain: Optional[float] = None,
                   clamp: Optional[float] = None):
    """Fast MUSA implementation of `bias_act()` using custom ops.

    Args:
        dim (int): The dimension in `x` corresponding to the elements of `b`.
            The value of `dim` is ignored if `b` is not specified.
            Defaults to 1.
        act (str): Name of the activation function to evaluate, or `"linear"`
            to disable. Can be e.g. "relu", "lrelu", "tanh", "sigmoid",
            "swish", etc. See `activation_funcs_musa` for a full list. `None`
            is not allowed. Defaults to `linear`.
        alpha (float | int): Shape parameter for the activation
            function, or `None` to use the default. Defaults to None.
        gain (float): Scaling factor for the output tensor, or `None`
            to use default.
See `activation_funcs_musa` for the default scaling of each activation function. If unsure, consider specifying 1. Defaults to None. clamp (float): Clamp the output values to `[-clamp, +clamp]`, or `None` to disable the clamping (default). Defaults to None. Returns: torch.Tensor: Tensor of the same shape and datatype as `x`. """ # Parse arguments. assert clamp is None or clamp >= 0 spec = activation_funcs_musa[act] alpha = float(alpha if alpha is not None else spec.def_alpha) gain = float(gain if gain is not None else spec.def_gain) clamp = float(clamp if clamp is not None else -1) # Lookup from cache. key = (dim, act, alpha, gain, clamp) if key in _bias_act_musa_cache: return _bias_act_musa_cache[key] # Forward op. class BiasActMusa(torch.autograd.Function): @staticmethod def forward(ctx, x, b): # pylint: disable=arguments-differ ctx.memory_format = torch.channels_last if x.ndim > 2 and x.stride( 1) == 1 else torch.contiguous_format x = x.contiguous(memory_format=ctx.memory_format) b = b.contiguous() if b is not None else _null_tensor.to(x.device) y = x if act != 'linear' or gain != 1 or clamp >= 0 or ( b is not _null_tensor.to(x.device)): y = ext_module.bias_act(x, b, _null_tensor.to(x.device), _null_tensor.to(x.device), _null_tensor.to(x.device), 0, dim, spec.musa_idx, alpha, gain, clamp) ctx.save_for_backward( x if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to( x.device), b if 'x' in spec.ref or spec.has_2nd_grad else _null_tensor.to(x.device), y if 'y' in spec.ref else _null_tensor.to(x.device)) return y @staticmethod def backward(ctx, dy): # pylint: disable=arguments-differ dy = dy.contiguous(memory_format=ctx.memory_format) x, b, y = ctx.saved_tensors dx = None db = None if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: dx = dy if act != 'linear' or gain != 1 or clamp >= 0: dx = BiasActMusaGrad.apply(dy, x, b, y) if ctx.needs_input_grad[1]: db = dx.sum([i for i in range(dx.ndim) if i != dim]) return dx, db # Backward op. 
class BiasActMusaGrad(torch.autograd.Function): @staticmethod def forward(ctx, dy, x, b, y): # pylint: disable=arguments-differ ctx.memory_format = torch.channels_last if dy.ndim > 2 and ( dy.stride(1) == 1) else torch.contiguous_format dx = ext_module.bias_act(dy, b, x, y, _null_tensor.to(x.device), 1, dim, spec.musa_idx, alpha, gain, clamp) ctx.save_for_backward( dy if spec.has_2nd_grad else _null_tensor.to(x.device), x, b, y) return dx @staticmethod def backward(ctx, d_dx): # pylint: disable=arguments-differ d_dx = d_dx.contiguous(memory_format=ctx.memory_format) dy, x, b, y = ctx.saved_tensors d_dy = None d_x = None d_b = None d_y = None if ctx.needs_input_grad[0]: d_dy = BiasActMusaGrad.apply(d_dx, x, b, y) if spec.has_2nd_grad and (ctx.needs_input_grad[1] or ctx.needs_input_grad[2]): d_x = ext_module.bias_act(d_dx, b, x, y, dy, 2, dim, spec.musa_idx, alpha, gain, clamp) if spec.has_2nd_grad and ctx.needs_input_grad[2]: d_b = d_x.sum([i for i in range(d_x.ndim) if i != dim]) return d_dy, d_x, d_b, d_y # Add to cache. _bias_act_musa_cache[key] = BiasActMusa return BiasActMusa ================================================ FILE: mmcv/ops/border_align.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
# modified from # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py from typing import Tuple import torch import torch.nn as nn from torch.autograd import Function from torch.autograd.function import once_differentiable from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['border_align_forward', 'border_align_backward']) class BorderAlignFunction(Function): @staticmethod def symbolic(g, input, boxes, pool_size): return g.op( 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) @staticmethod def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, pool_size: int) -> torch.Tensor: ctx.pool_size = pool_size ctx.input_shape = input.size() assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]' assert boxes.size(2) == 4, \ 'the last dimension of boxes must be (x1, y1, x2, y2)' assert input.size(1) % 4 == 0, \ 'the channel for input feature must be divisible by factor 4' # [B, C//4, H*W, 4] output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4) output = input.new_zeros(output_shape) # `argmax_idx` only used for backward argmax_idx = input.new_zeros(output_shape).to(torch.int) ext_module.border_align_forward( input, boxes, output, argmax_idx, pool_size=ctx.pool_size) ctx.save_for_backward(boxes, argmax_idx) return output @staticmethod @once_differentiable def backward(ctx, grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]: boxes, argmax_idx = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) # complex head architecture may cause grad_output uncontiguous grad_output = grad_output.contiguous() ext_module.border_align_backward( grad_output, boxes, argmax_idx, grad_input, pool_size=ctx.pool_size) return grad_input, None, None border_align = BorderAlignFunction.apply class BorderAlign(nn.Module): r"""Border align pooling layer. Applies border_align over the input feature based on predicted bboxes. 
The details were described in the paper `BorderDet: Border Feature for Dense Object Detection `_. For each border line (e.g. top, left, bottom or right) of each box, border_align does the following: 1. uniformly samples ``pool_size`` +1 positions on this line, involving the start and end points. 2. the corresponding features on these points are computed by bilinear interpolation. 3. max pooling over all the ``pool_size`` +1 positions are used for computing pooled feature. Args: pool_size (int): number of positions sampled over the boxes' borders (e.g. top, bottom, left, right). """ def __init__(self, pool_size: int): super().__init__() self.pool_size = pool_size def forward(self, input: torch.Tensor, boxes: torch.Tensor) -> torch.Tensor: """ Args: input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, right features respectively. boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). Returns: torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is (top,left,bottom,right) for the last dimension. """ return border_align(input, boxes, self.pool_size) def __repr__(self): s = self.__class__.__name__ s += f'(pool_size={self.pool_size})' return s ================================================ FILE: mmcv/ops/box_iou_quadri.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['box_iou_quadri']) def box_iou_quadri(bboxes1: torch.Tensor, bboxes2: torch.Tensor, mode: str = 'iou', aligned: bool = False) -> torch.Tensor: """Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in (x1, y1, ..., x4, y4) format. If ``aligned`` is ``False``, then calculate the ious between each bbox of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. 
Args: bboxes1 (torch.Tensor): quadrilateral bboxes 1. It has shape (N, 8), indicating (x1, y1, ..., x4, y4) for each row. bboxes2 (torch.Tensor): quadrilateral bboxes 2. It has shape (M, 8), indicating (x1, y1, ..., x4, y4) for each row. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: torch.Tensor: Return the ious betweens boxes. If ``aligned`` is ``False``, the shape of ious is (N, M) else (N,). """ assert mode in ['iou', 'iof'] mode_dict = {'iou': 0, 'iof': 1} mode_flag = mode_dict[mode] rows = bboxes1.size(0) cols = bboxes2.size(0) if aligned: ious = bboxes1.new_zeros(rows) else: ious = bboxes1.new_zeros(rows * cols) bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_quadri( bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) if not aligned: ious = ious.view(rows, cols) return ious ================================================ FILE: mmcv/ops/box_iou_rotated.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) def box_iou_rotated(bboxes1: torch.Tensor, bboxes2: torch.Tensor, mode: str = 'iou', aligned: bool = False, clockwise: bool = True) -> torch.Tensor: """Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in (x_center, y_center, width, height, angle) format. If ``aligned`` is ``False``, then calculate the ious between each bbox of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. .. note:: The operator assumes: 1) The positive direction along x axis is left -> right. 2) The positive direction along y axis is top -> down. 3) The w border is in parallel with x axis when angle = 0. However, there are 2 opposite definitions of the positive angular direction, clockwise (CW) and counter-clockwise (CCW). 
MMCV supports both definitions and uses CW by default. Please set ``clockwise=False`` if you are using the CCW definition. The coordinate system when ``clockwise`` is ``True`` (default) .. code-block:: none 0-------------------> x (0 rad) | A-------------B | | | | | box h | | angle=0 | | D------w------C v y (pi/2 rad) In such coordination system the rotation matrix is .. math:: \\begin{pmatrix} \\cos\\alpha & -\\sin\\alpha \\\\ \\sin\\alpha & \\cos\\alpha \\end{pmatrix} The coordinates of the corner point A can be calculated as: .. math:: P_A= \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} = \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\ \\sin\\alpha & \\cos\\alpha\\end{pmatrix} \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ = \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha \\\\ y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} The coordinate system when ``clockwise`` is ``False`` .. code-block:: none 0-------------------> x (0 rad) | A-------------B | | | | | box h | | angle=0 | | D------w------C v y (-pi/2 rad) In such coordination system the rotation matrix is .. math:: \\begin{pmatrix} \\cos\\alpha & \\sin\\alpha \\\\ -\\sin\\alpha & \\cos\\alpha \\end{pmatrix} The coordinates of the corner point A can be calculated as: .. math:: P_A= \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} = \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\ -\\sin\\alpha & \\cos\\alpha\\end{pmatrix} \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ = \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha \\\\ y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} Args: boxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5), indicating (x, y, w, h, theta) for each row. Note that theta is in radian. boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5), indicating (x, y, w, h, theta) for each row. 
Note that theta is in radian. mode (str): "iou" (intersection over union) or iof (intersection over foreground). clockwise (bool): flag indicating whether the positive angular orientation is clockwise. default True. `New in version 1.4.3.` Returns: torch.Tensor: Return the ious betweens boxes. If ``aligned`` is ``False``, the shape of ious is (N, M) else (N,). """ assert mode in ['iou', 'iof'] mode_dict = {'iou': 0, 'iof': 1} mode_flag = mode_dict[mode] rows = bboxes1.size(0) cols = bboxes2.size(0) if aligned: ious = bboxes1.new_zeros(rows) else: if bboxes1.device.type == 'mlu': ious = bboxes1.new_zeros([rows, cols]) else: ious = bboxes1.new_zeros(rows * cols) if not clockwise: flip_mat = bboxes1.new_ones(bboxes1.shape[-1]) flip_mat[-1] = -1 bboxes1 = bboxes1 * flip_mat bboxes2 = bboxes2 * flip_mat bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) if not aligned: ious = ious.view(rows, cols) return ious ================================================ FILE: mmcv/ops/carafe.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Tuple import torch import torch.nn as nn import torch.nn.functional as F from mmengine.model import normal_init, xavier_init from mmengine.registry import MODELS from torch import Tensor from torch.autograd import Function from torch.nn.modules.module import Module from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', [ 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward', 'carafe_backward' ]) class CARAFENaiveFunction(Function): @staticmethod def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, group_size: int, scale_factor: int) -> Tensor: return g.op( 'mmcv::MMCVCARAFENaive', features, masks, kernel_size_i=kernel_size, group_size_i=group_size, scale_factor_f=scale_factor) @staticmethod def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, group_size: int, scale_factor: int) -> Tensor: assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-2) == features.size(-2) * scale_factor assert features.size(1) % group_size == 0 assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 ctx.kernel_size = kernel_size ctx.group_size = group_size ctx.scale_factor = scale_factor ctx.feature_size = features.size() ctx.mask_size = masks.size() n, c, h, w = features.size() output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) ext_module.carafe_naive_forward( features, masks, output, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) if features.requires_grad or masks.requires_grad or \ torch.__version__ == 'parrots': ctx.save_for_backward(features, masks) return output @staticmethod def backward( ctx, grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: assert grad_output.is_cuda or grad_output.is_musa features, masks = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size scale_factor = ctx.scale_factor grad_input = 
torch.zeros_like(features) grad_masks = torch.zeros_like(masks) ext_module.carafe_naive_backward( grad_output.contiguous(), features, masks, grad_input, grad_masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) return grad_input, grad_masks, None, None, None carafe_naive = CARAFENaiveFunction.apply class CARAFENaive(Module): def __init__(self, kernel_size: int, group_size: int, scale_factor: int): super().__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) self.kernel_size = kernel_size self.group_size = group_size self.scale_factor = scale_factor def forward(self, features: Tensor, masks: Tensor) -> Tensor: return carafe_naive(features, masks, self.kernel_size, self.group_size, self.scale_factor) class CARAFEFunction(Function): @staticmethod def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, group_size: int, scale_factor: int) -> Tensor: return g.op( 'mmcv::MMCVCARAFE', features, masks, kernel_size_i=kernel_size, group_size_i=group_size, scale_factor_f=scale_factor) @staticmethod def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, group_size: int, scale_factor: int) -> Tensor: assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor assert masks.size(-2) == features.size(-2) * scale_factor assert features.size(1) % group_size == 0 assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 ctx.kernel_size = kernel_size ctx.group_size = group_size ctx.scale_factor = scale_factor ctx.feature_size = features.size() ctx.mask_size = masks.size() n, c, h, w = features.size() output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) routput = features.new_zeros(output.size(), requires_grad=False) rfeatures = features.new_zeros(features.size(), requires_grad=False) rmasks = masks.new_zeros(masks.size(), requires_grad=False) ext_module.carafe_forward( 
features, masks, rfeatures, routput, rmasks, output, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) if features.requires_grad or masks.requires_grad or \ torch.__version__ == 'parrots': ctx.save_for_backward(features, masks, rfeatures) return output @staticmethod def backward( ctx, grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: features, masks, rfeatures = ctx.saved_tensors kernel_size = ctx.kernel_size group_size = ctx.group_size scale_factor = ctx.scale_factor rgrad_output = torch.zeros_like(grad_output, requires_grad=False) rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) rgrad_input = torch.zeros_like(features, requires_grad=False) rgrad_masks = torch.zeros_like(masks, requires_grad=False) grad_input = torch.zeros_like(features, requires_grad=False) grad_masks = torch.zeros_like(masks, requires_grad=False) ext_module.carafe_backward( grad_output.contiguous(), rfeatures, masks, rgrad_output, rgrad_input_hs, rgrad_input, rgrad_masks, grad_input, grad_masks, kernel_size=kernel_size, group_size=group_size, scale_factor=scale_factor) return grad_input, grad_masks, None, None, None carafe = CARAFEFunction.apply class CARAFE(Module): """ CARAFE: Content-Aware ReAssembly of FEatures Please refer to `CARAFE: Content-Aware ReAssembly of FEatures `_ for more details. 
Args: kernel_size (int): reassemble kernel size group_size (int): reassemble group size scale_factor (int): upsample ratio Returns: upsampled feature map """ def __init__(self, kernel_size: int, group_size: int, scale_factor: int): super().__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) self.kernel_size = kernel_size self.group_size = group_size self.scale_factor = scale_factor def forward(self, features: Tensor, masks: Tensor) -> Tensor: return carafe(features, masks, self.kernel_size, self.group_size, self.scale_factor) @MODELS.register_module(name='carafe') class CARAFEPack(nn.Module): """A unified package of CARAFE upsampler that contains: 1) channel compressor 2) content encoder 3) CARAFE op. Official implementation of ICCV 2019 paper `CARAFE: Content-Aware ReAssembly of FEatures `_. Args: channels (int): input feature channels scale_factor (int): upsample ratio up_kernel (int): kernel size of CARAFE op up_group (int): group size of CARAFE op encoder_kernel (int): kernel size of content encoder encoder_dilation (int): dilation of content encoder compressed_channels (int): output channels of channels compressor Returns: upsampled feature map """ def __init__(self, channels: int, scale_factor: int, up_kernel: int = 5, up_group: int = 1, encoder_kernel: int = 3, encoder_dilation: int = 1, compressed_channels: int = 64): super().__init__() self.channels = channels self.scale_factor = scale_factor self.up_kernel = up_kernel self.up_group = up_group self.encoder_kernel = encoder_kernel self.encoder_dilation = encoder_dilation self.compressed_channels = compressed_channels self.channel_compressor = nn.Conv2d(channels, self.compressed_channels, 1) self.content_encoder = nn.Conv2d( self.compressed_channels, self.up_kernel * self.up_kernel * self.up_group * self.scale_factor * self.scale_factor, self.encoder_kernel, padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2), 
dilation=self.encoder_dilation, groups=1) self.init_weights() def init_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): xavier_init(m, distribution='uniform') normal_init(self.content_encoder, std=0.001) def kernel_normalizer(self, mask: Tensor) -> Tensor: mask = F.pixel_shuffle(mask, self.scale_factor) n, mask_c, h, w = mask.size() # use float division explicitly, # to void inconsistency while exporting to onnx mask_channel = int(mask_c / float(self.up_kernel**2)) mask = mask.view(n, mask_channel, -1, h, w) mask = F.softmax(mask, dim=2, dtype=mask.dtype) mask = mask.view(n, mask_c, h, w).contiguous() return mask def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) return x def forward(self, x: Tensor) -> Tensor: compressed_x = self.channel_compressor(x) mask = self.content_encoder(compressed_x) mask = self.kernel_normalizer(mask) x = self.feature_reassemble(x, mask) return x ================================================ FILE: mmcv/ops/cc_attention.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from mmengine.registry import MODELS from mmcv.cnn import Scale def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: """Returns a diagonal matrix of size [n, n]. The diagonal are all "-inf". This is for avoiding calculating the overlapped element in the Criss-Cross twice. """ return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0) @MODELS.register_module() class CrissCrossAttention(nn.Module): """Criss-Cross Attention Module. .. note:: Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch to a pure PyTorch and equivalent implementation. For more details, please refer to https://github.com/open-mmlab/mmcv/pull/1201. 
Speed comparison for one forward pass - Input size: [2,512,97,97] - Device: 1 NVIDIA GeForce RTX 2080 Ti +-----------------------+---------------+------------+---------------+ | |PyTorch version|CUDA version|Relative speed | +=======================+===============+============+===============+ |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x | +-----------------------+---------------+------------+---------------+ |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x | +-----------------------+---------------+------------+---------------+ Args: in_channels (int): Channels of the input feature map. """ def __init__(self, in_channels: int) -> None: super().__init__() self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.value_conv = nn.Conv2d(in_channels, in_channels, 1) self.gamma = Scale(0.) self.in_channels = in_channels def forward(self, x: torch.Tensor) -> torch.Tensor: """Forward function of Criss-Cross Attention. Args: x (torch.Tensor): Input feature with the shape of (batch_size, in_channels, height, width). 
Returns: torch.Tensor: Output of the layer, with the shape of (batch_size, in_channels, height, width) """ B, C, H, W = x.size() query = self.query_conv(x) key = self.key_conv(x) value = self.value_conv(x) energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG( H, query.device) energy_H = energy_H.transpose(1, 2) energy_W = torch.einsum('bchw,bchj->bhwj', query, key) attn = F.softmax( torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)] out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H]) out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:]) out = self.gamma(out) + x out = out.contiguous() return out def __repr__(self) -> str: s = self.__class__.__name__ s += f'(in_channels={self.in_channels})' return s ================================================ FILE: mmcv/ops/chamfer_distance.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Sequence, Tuple import torch from torch import Tensor from torch.autograd import Function from torch.autograd.function import once_differentiable from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward']) class ChamferDistanceFunction(Function): """This is an implementation of the 2D Chamfer Distance. It has been used in the paper `Oriented RepPoints for Aerial Object Detection (CVPR 2022) _`. """ @staticmethod def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]: """ Args: xyz1 (Tensor): Point set with shape (B, N, 2). xyz2 (Tensor): Point set with shape (B, N, 2). Returns: Sequence[Tensor]: - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with shape (B, N). - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with shape (B, N). - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) with shape (B, N), which be used in compute gradient. 
- idx2 (Tensor): Index of chamfer distance (xyz2 to xyz2) with shape (B, N), which be used in compute gradient. """ batch_size, n, _ = xyz1.size() _, m, _ = xyz2.size() device = xyz1.device xyz1 = xyz1.contiguous() xyz2 = xyz2.contiguous() dist1 = torch.zeros(batch_size, n).type(xyz1.dtype).to(device) dist2 = torch.zeros(batch_size, m).type(xyz2.dtype).to(device) idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device) idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device) ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, idx2) ctx.save_for_backward(xyz1, xyz2, idx1, idx2) return dist1, dist2, idx1, idx2 @staticmethod @once_differentiable def backward(ctx, grad_dist1: Tensor, grad_dist2: Tensor, grad_idx1=None, grad_idx2=None) -> Tuple[Tensor, Tensor]: """ Args: grad_dist1 (Tensor): Gradient of chamfer distance (xyz1 to xyz2) with shape (B, N). grad_dist2 (Tensor): Gradient of chamfer distance (xyz2 to xyz1) with shape (B, N). Returns: Tuple[Tensor, Tensor]: - grad_xyz1 (Tensor): Gradient of the point set with shape \ (B, N, 2). - grad_xyz2 (Tensor):Gradient of the point set with shape \ (B, N, 2). """ xyz1, xyz2, idx1, idx2 = ctx.saved_tensors device = grad_dist1.device grad_dist1 = grad_dist1.contiguous() grad_dist2 = grad_dist2.contiguous() grad_xyz1 = torch.zeros(xyz1.size()).type(xyz1.dtype).to(device) grad_xyz2 = torch.zeros(xyz2.size()).type(xyz2.dtype).to(device) ext_module.chamfer_distance_backward(xyz1, xyz2, idx1, idx2, grad_dist1, grad_dist2, grad_xyz1, grad_xyz2) return grad_xyz1, grad_xyz2 chamfer_distance = ChamferDistanceFunction.apply ================================================ FILE: mmcv/ops/contour_expand.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Union import numpy as np import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['contour_expand']) def contour_expand(kernel_mask: Union[np.array, torch.Tensor], internal_kernel_label: Union[np.array, torch.Tensor], min_kernel_area: int, kernel_num: int) -> list: """Expand kernel contours so that foreground pixels are assigned into instances. Args: kernel_mask (np.array or torch.Tensor): The instance kernel mask with size hxw. internal_kernel_label (np.array or torch.Tensor): The instance internal kernel label with size hxw. min_kernel_area (int): The minimum kernel area. kernel_num (int): The instance kernel number. Returns: list: The instance index map with size hxw. """ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) assert isinstance(min_kernel_area, int) assert isinstance(kernel_num, int) if isinstance(kernel_mask, np.ndarray): kernel_mask = torch.from_numpy(kernel_mask) if isinstance(internal_kernel_label, np.ndarray): internal_kernel_label = torch.from_numpy(internal_kernel_label) if torch.__version__ == 'parrots': if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0: label = [] else: label = ext_module.contour_expand( kernel_mask, internal_kernel_label, min_kernel_area=min_kernel_area, kernel_num=kernel_num) label = label.tolist() # type: ignore else: label = ext_module.contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, kernel_num) return label ================================================ FILE: mmcv/ops/conv2d_gradfix.py ================================================ # Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # NVIDIA CORPORATION and its licensors retain all intellectual property # and proprietary rights in and to this software, related documentation # and any modifications thereto. 
Any use, reproduction, disclosure or # distribution of this software and related documentation without an express # license agreement from NVIDIA CORPORATION is strictly prohibited. # source: https://github.com/NVlabs/stylegan3/blob/main/torch_utils/ops/conv2d_gradfix.py # noqa """Custom replacement for `torch.nn.functional.conv2d` that supports arbitrarily high order gradients with zero performance penalty.""" import contextlib import warnings from typing import Dict, Optional, Tuple, Union import torch from mmengine.device import is_musa_available from mmengine.utils import digit_version from mmengine.utils.dl_utils.parrots_wrapper import is_rocm_pytorch enabled = True weight_gradients_disabled = False @contextlib.contextmanager def no_weight_gradients(disable=True): global weight_gradients_disabled old = weight_gradients_disabled if disable: weight_gradients_disabled = True yield weight_gradients_disabled = old def conv2d(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Union[int, Tuple[int, ...]] = 1, padding: Union[int, Tuple[int, ...]] = 0, dilation: Union[int, Tuple[int, ...]] = 1, groups: int = 1): flag = True if digit_version(torch.__version__) >= digit_version('1.10.0'): warnings.warn('Since ' 'aten:cudnn_convolution_backward_weight is ' f'not supported in torch=={torch.__version__},' ' rolling back to `torch.nn.functional.conv2d`') flag = False if _should_use_custom_op(input) and flag: return _conv2d_gradfix( transpose=False, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=0, dilation=dilation, groups=groups).apply(input, weight, bias) return torch.nn.functional.conv2d( input=input, weight=weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups) def conv_transpose2d(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Union[int, Tuple[int, ...]] = 1, padding: Union[int, Tuple[int, ...]] = 0, output_padding: Union[int, Tuple[int, 
...]] = 0, groups: int = 1, dilation: Union[int, Tuple[int, ...]] = 1): if _should_use_custom_op(input): return _conv2d_gradfix( transpose=True, weight_shape=weight.shape, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation).apply(input, weight, bias) return torch.nn.functional.conv_transpose2d( input=input, weight=weight, bias=bias, stride=stride, padding=padding, output_padding=output_padding, groups=groups, dilation=dilation) def _should_use_custom_op(input): assert isinstance(input, torch.Tensor) if enabled and is_musa_available(): return True if (not enabled) or (not torch.backends.cudnn.enabled): return False if input.device.type != 'cuda': return False return True def _to_tuple(x, ndim): xs = tuple(x) if isinstance(x, (tuple, list)) else (x, ) * ndim assert len(xs) == ndim assert all(isinstance(x, int) for x in xs) return xs _conv2d_gradfix_cache: Dict = dict() _null_tensor = torch.empty([0]) def _conv2d_gradfix( transpose: bool, weight_shape: Tuple[int, ...], stride: Union[int, Tuple[int, ...]], padding: Union[int, Tuple[int, ...]], output_padding: Union[int, Tuple[int, ...]], dilation: Union[int, Tuple[int, ...]], groups: int, ): # Parse arguments. ndim = 2 weight_shape = tuple(weight_shape) stride = _to_tuple(stride, ndim) padding = _to_tuple(padding, ndim) output_padding = _to_tuple(output_padding, ndim) dilation = _to_tuple(dilation, ndim) # Lookup from cache. key = (transpose, weight_shape, stride, padding, output_padding, dilation, groups) if key in _conv2d_gradfix_cache: return _conv2d_gradfix_cache[key] # Validate arguments. 
assert groups >= 1 assert len(weight_shape) == ndim + 2 assert all(stride[i] >= 1 for i in range(ndim)) # type: ignore assert all(padding[i] >= 0 for i in range(ndim)) # type: ignore assert all(dilation[i] >= 0 for i in range(ndim)) # type: ignore if not transpose: assert all(output_padding[i] == 0 for i in range(ndim)) # type: ignore else: # transpose for i in range(ndim): assert 0 <= output_padding[i] < max( # type: ignore stride[i], # type: ignore dilation[i]) # type: ignore # Helpers. common_kwargs = dict( stride=stride, padding=padding, dilation=dilation, groups=groups) def calc_output_padding(input_shape, output_shape): if transpose: return [0, 0] return [ input_shape[i + 2] - (output_shape[i + 2] - 1) * stride[i] - (1 - 2 * padding[i]) - dilation[i] * (weight_shape[i + 2] - 1) for i in range(ndim) ] # Forward & backward. class Conv2d(torch.autograd.Function): @staticmethod def forward(ctx, input, weight, bias): assert weight.shape == weight_shape ctx.save_for_backward( input if weight.requires_grad else _null_tensor, weight if input.requires_grad else _null_tensor, ) ctx.input_shape = input.shape # Simple 1x1 convolution => cuBLAS (only on Volta, not on Ampere). if (not is_musa_available() ) and weight_shape[2:] == stride == dilation == ( 1, 1) and padding == ( 0, 0) and torch.cuda.get_device_capability( input.device) < (8, 0): a = weight.reshape(groups, weight_shape[0] // groups, weight_shape[1]) b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1) c = (a.transpose(1, 2) if transpose else a) @ b.permute( 1, 2, 0, 3).flatten(2) c = c.reshape(-1, input.shape[0], *input.shape[2:]).transpose(0, 1) c = c if bias is None else c + bias.unsqueeze(0).unsqueeze( 2).unsqueeze(3) return c.contiguous( memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format)) # General case => cuDNN. 
if transpose: return torch.nn.functional.conv_transpose2d( input=input, weight=weight, bias=bias, output_padding=output_padding, **common_kwargs) return torch.nn.functional.conv2d( input=input, weight=weight, bias=bias, **common_kwargs) @staticmethod def backward(ctx, grad_output): input, weight = ctx.saved_tensors input_shape = ctx.input_shape grad_input = None grad_weight = None grad_bias = None if ctx.needs_input_grad[0]: p = calc_output_padding( input_shape=input_shape, output_shape=grad_output.shape) op = _conv2d_gradfix( transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs) grad_input = op.apply(grad_output, weight, None) assert grad_input.shape == input_shape if ctx.needs_input_grad[1] and not weight_gradients_disabled: grad_weight = Conv2dGradWeight.apply(grad_output, input) assert grad_weight.shape == weight_shape if ctx.needs_input_grad[2]: grad_bias = grad_output.sum([0, 2, 3]) return grad_input, grad_weight, grad_bias # Gradient with respect to the weights. class Conv2dGradWeight(torch.autograd.Function): @staticmethod def forward(ctx, grad_output, input): ctx.save_for_backward( grad_output if input.requires_grad else _null_tensor, input if grad_output.requires_grad else _null_tensor, ) ctx.grad_output_shape = grad_output.shape ctx.input_shape = input.shape # Simple 1x1 convolution => cuBLAS (on both Volta and Ampere). 
if weight_shape[2:] == stride == dilation == ( 1, 1) and padding == (0, 0): a = grad_output.reshape(grad_output.shape[0], groups, grad_output.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2) b = input.reshape(input.shape[0], groups, input.shape[1] // groups, -1).permute(1, 2, 0, 3).flatten(2) c = (b @ a.transpose(1, 2) if transpose else a @ b.transpose(1, 2)).reshape(weight_shape) return c.contiguous( memory_format=(torch.channels_last if input.stride(1) == 1 else torch.contiguous_format)) # PyTorch consolidated convolution backward API in PR: # https://github.com/pytorch/pytorch/commit/3dc3651e0ee3623f669c3a2c096408dbc476d122 # noqa: E501 # Enhance the code referring to the discussion: # https://github.com/pytorch/pytorch/issues/74437 if digit_version(torch.__version__) >= digit_version('1.11.0'): empty_weight = torch.tensor( 0.0, dtype=input.dtype, device=input.device).expand(weight_shape) output_padding = calc_output_padding(input.shape, grad_output.shape) return torch.ops.aten.convolution_backward( grad_output, input, empty_weight, None, stride=stride, dilation=dilation, transposed=transpose, padding=padding, groups=groups, output_padding=output_padding, output_mask=[0, 1, 0])[1] else: if is_rocm_pytorch(): name = 'aten::miopen_convolution_transpose_backward_weight' if not transpose: name = 'aten::miopen_convolution_backward_weight' flags = [ torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic ] else: # General case => cuDNN. 
name = ('aten::cudnn_convolution_transpose_backward_weight' if transpose else 'aten::cudnn_convolution_backward_weight') flags = [ torch.backends.cudnn.benchmark, torch.backends.cudnn.deterministic, torch.backends.cudnn.allow_tf32 ] return torch._C._jit_get_operation(name)(weight_shape, grad_output, input, padding, stride, dilation, groups, *flags) @staticmethod def backward(ctx, grad2_grad_weight): grad_output, input = ctx.saved_tensors grad_output_shape = ctx.grad_output_shape input_shape = ctx.input_shape grad2_grad_output = None grad2_input = None if ctx.needs_input_grad[0]: grad2_grad_output = Conv2d.apply(input, grad2_grad_weight, None) assert grad2_grad_output.shape == grad_output_shape if ctx.needs_input_grad[1]: p = calc_output_padding( input_shape=input_shape, output_shape=grad_output_shape) op = _conv2d_gradfix( transpose=(not transpose), weight_shape=weight_shape, output_padding=p, **common_kwargs) grad2_input = op.apply(grad_output, grad2_grad_weight, None) assert grad2_input.shape == input_shape return grad2_grad_output, grad2_input _conv2d_gradfix_cache[key] = Conv2d return Conv2d ================================================ FILE: mmcv/ops/convex_iou.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import Tuple import torch from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou']) def convex_giou(pointsets: torch.Tensor, polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Return generalized intersection-over-union (Jaccard index) between point sets and polygons. Args: pointsets (torch.Tensor): It has shape (N, 18), indicating (x1, y1, x2, y2, ..., x9, y9) for each row. polygons (torch.Tensor): It has shape (N, 8), indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. Returns: tuple[torch.Tensor, torch.Tensor]: The first element is the gious between point sets and polygons with the shape (N,). 
The second element is the gradient of point sets with the shape (N, 18). """ output = pointsets.new_zeros((pointsets.size(0), 19)) ext_module.convex_giou(pointsets, polygons, output) convex_giou = output[:, -1] points_grad = output[:, 0:-1] return convex_giou, points_grad def convex_iou(pointsets: torch.Tensor, polygons: torch.Tensor) -> torch.Tensor: """Return intersection-over-union (Jaccard index) between point sets and polygons. Args: pointsets (torch.Tensor): It has shape (N, 18), indicating (x1, y1, x2, y2, ..., x9, y9) for each row. polygons (torch.Tensor): It has shape (K, 8), indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. Returns: torch.Tensor: Return the ious between point sets and polygons with the shape (N, K). """ N, K = pointsets.size(0), polygons.size(0) ious = pointsets.new_zeros((N, K)) ext_module.convex_iou(pointsets, polygons, ious) return ious ================================================ FILE: mmcv/ops/corner_pool.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmengine.utils import digit_version from torch import Tensor, nn _mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: size = x.size(dim) output = x.clone() ind = 1 while ind < size: if flip: cur_start = 0 cur_len = size - ind next_start = ind next_len = size - ind else: cur_start = ind cur_len = size - ind next_start = 0 next_len = size - ind # max_temp should be cloned for backward computation max_temp = output.narrow(dim, cur_start, cur_len).clone() cur_temp = output.narrow(dim, cur_start, cur_len) next_temp = output.narrow(dim, next_start, next_len) cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) ind = ind << 1 return output class CornerPool(nn.Module): """Corner Pooling. Corner Pooling is a new type of pooling layer that helps a convolutional network better localize corners of bounding boxes. 
Please refer to `CornerNet: Detecting Objects as Paired Keypoints `_ for more details. Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Args: mode (str): Pooling orientation for the pooling layer - 'bottom': Bottom Pooling - 'left': Left Pooling - 'right': Right Pooling - 'top': Top Pooling Returns: Feature map after pooling. """ cummax_dim_flip = { 'bottom': (2, False), 'left': (3, True), 'right': (3, False), 'top': (2, True), } def __init__(self, mode: str): super().__init__() assert mode in self.cummax_dim_flip self.mode = mode def forward(self, x: Tensor) -> Tensor: if (torch.__version__ != 'parrots' and digit_version(torch.__version__) >= digit_version('1.5.0')): dim, flip = self.cummax_dim_flip[self.mode] if flip: x = x.flip(dim) pool_tensor, _ = torch.cummax(x, dim=dim) if flip: pool_tensor = pool_tensor.flip(dim) return pool_tensor else: dim, flip = self.cummax_dim_flip[self.mode] return _corner_pool(x, dim, flip) ================================================ FILE: mmcv/ops/correlation.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from typing import Tuple import torch from torch import Tensor, nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from ..utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['correlation_forward', 'correlation_backward']) class CorrelationFunction(Function): @staticmethod def forward(ctx, input1: Tensor, input2: Tensor, kernel_size: int = 1, max_displacement: int = 1, stride: int = 1, padding: int = 1, dilation: int = 1, dilation_patch: int = 1) -> Tensor: ctx.save_for_backward(input1, input2) kH, kW = ctx.kernel_size = _pair(kernel_size) patch_size = max_displacement * 2 + 1 ctx.patch_size = patch_size dH, dW = ctx.stride = _pair(stride) padH, padW = ctx.padding = _pair(padding) dilationH, dilationW = ctx.dilation = _pair(dilation) dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair( dilation_patch) output_size = CorrelationFunction._output_size(ctx, input1) output = input1.new_zeros(output_size) ext_module.correlation_forward( input1, input2, output, kH=kH, kW=kW, patchH=patch_size, patchW=patch_size, padH=padH, padW=padW, dilationH=dilationH, dilationW=dilationW, dilation_patchH=dilation_patchH, dilation_patchW=dilation_patchW, dH=dH, dW=dW) return output @staticmethod @once_differentiable def backward( ctx, grad_output: Tensor ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]: input1, input2 = ctx.saved_tensors kH, kW = ctx.kernel_size patch_size = ctx.patch_size padH, padW = ctx.padding dilationH, dilationW = ctx.dilation dilation_patchH, dilation_patchW = ctx.dilation_patch dH, dW = ctx.stride grad_input1 = torch.zeros_like(input1) grad_input2 = torch.zeros_like(input2) ext_module.correlation_backward( grad_output, input1, input2, grad_input1, grad_input2, kH=kH, kW=kW, patchH=patch_size, patchW=patch_size, padH=padH, padW=padW, dilationH=dilationH, dilationW=dilationW, dilation_patchH=dilation_patchH, dilation_patchW=dilation_patchW, 
dH=dH, dW=dW) return grad_input1, grad_input2, None, None, None, None, None, None @staticmethod def _output_size(ctx, input1): iH, iW = input1.size(2), input1.size(3) batch_size = input1.size(0) kH, kW = ctx.kernel_size patch_size = ctx.patch_size dH, dW = ctx.stride padH, padW = ctx.padding dilationH, dilationW = ctx.dilation dilatedKH = (kH - 1) * dilationH + 1 dilatedKW = (kW - 1) * dilationW + 1 oH = int((iH + 2 * padH - dilatedKH) / dH + 1) oW = int((iW + 2 * padW - dilatedKW) / dW + 1) output_size = (batch_size, patch_size, patch_size, oH, oW) return output_size class Correlation(nn.Module): r"""Correlation operator. This correlation operator works for optical flow correlation computation. There are two batched tensors with shape :math:`(N, C, H, W)`, and the correlation output's shape is :math:`(N, max\_displacement \times 2 + 1, max\_displacement * 2 + 1, H_{out}, W_{out})` where .. math:: H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding - dilation \times (kernel\_size - 1) - 1} {stride} + 1\right\rfloor .. math:: W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation \times (kernel\_size - 1) - 1} {stride} + 1\right\rfloor the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding window convolution between input1 and shifted input2, .. math:: Corr(N_i, dx, dy) = \sum_{c=0}^{C-1} input1(N_i, c) \star \mathcal{S}(input2(N_i, c), dy, dx) where :math:`\star` is the valid 2d sliding window convolution operator, and :math:`\mathcal{S}` means shifting the input features (auto-complete zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy \in [-max\_displacement \times dilation\_patch, max\_displacement \times dilation\_patch]`. Args: kernel_size (int): The size of sliding window i.e. local neighborhood representing the center points and involved in correlation computation. Defaults to 1. 
max_displacement (int): The radius for computing correlation volume, but the actual working space can be dilated by dilation_patch. Defaults to 1. stride (int): The stride of the sliding blocks in the input spatial dimensions. Defaults to 1. padding (int): Zero padding added to all four sides of the input1. Defaults to 0. dilation (int): The spacing of local neighborhood that will involved in correlation. Defaults to 1. dilation_patch (int): The spacing between position need to compute correlation. Defaults to 1. """ def __init__(self, kernel_size: int = 1, max_displacement: int = 1, stride: int = 1, padding: int = 0, dilation: int = 1, dilation_patch: int = 1) -> None: super().__init__() self.kernel_size = kernel_size self.max_displacement = max_displacement self.stride = stride self.padding = padding self.dilation = dilation self.dilation_patch = dilation_patch def forward(self, input1: Tensor, input2: Tensor) -> Tensor: return CorrelationFunction.apply(input1, input2, self.kernel_size, self.max_displacement, self.stride, self.padding, self.dilation, self.dilation_patch) def __repr__(self) -> str: s = self.__class__.__name__ s += f'(kernel_size={self.kernel_size}, ' s += f'max_displacement={self.max_displacement}, ' s += f'stride={self.stride}, ' s += f'padding={self.padding}, ' s += f'dilation={self.dilation}, ' s += f'dilation_patch={self.dilation_patch})' return s ================================================ FILE: mmcv/ops/csrc/README.md ================================================ # Code Structure of CUDA operators This folder contains all non-python code for MMCV custom ops. Please follow the same architecture if you want to add new ops. ## Directories Tree ```folder . 
├── common
│   ├── box_iou_rotated_utils.hpp
│   ├── parrots_cpp_helper.hpp
│   ├── parrots_cuda_helper.hpp
│   ├── pytorch_cpp_helper.hpp
│   ├── pytorch_cuda_helper.hpp
│   ├── pytorch_device_registry.hpp
│   ├── cuda
│   │   ├── common_cuda_helper.hpp
│   │   ├── parrots_cudawarpfunction.cuh
│   │   ├── ...
│   │   └── ops_cuda_kernel.cuh
│   ├── mps
│   │   ├── MPSLibrary.h
│   │   ├── ...
│   │   └── MPSUtils.h
│   ├── mlu
│   │   └── ...
│   └── utils
│       └── ...
├── parrots
│   ├── ...
│   ├── ops.cpp
│   ├── ops_parrots.cpp
│   └── ops_pytorch.h
└── pytorch
    ├── info.cpp
    ├── pybind.cpp
    ├── ...
    ├── ops.cpp
    ├── cuda
    │   ├── ...
    │   └── ops_cuda.cu
    ├── cpu
    │   ├── ...
    │   └── ops.cpp
    ├── mps
    │   ├── ...
    │   └── op_mps.mm
    └── mlu
        ├── ...
        └── op_mlu.cpp
```

## Components

- `common`: This directory contains all tools and shared codes.
  - `cuda`: The CUDA kernels which can be shared by all backends. **HIP** kernels are also here since they have similar syntax.
  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
  - `utils`: The kernels and utils of spconv.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
  - `cuda`: This directory contains CUDA kernel launchers, which feed memory pointers of tensors to the CUDA kernels in `common/cuda`. The launchers provide the C++ interface of the CUDA implementation of the corresponding custom ops.
  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains launchers for each MLU kernel.
  - `mps`: MPS ops implementation and launchers.

## How to add new PyTorch ops?

1. (Optional) Add shared kernel in `common` to support special hardware platform.

   ```c++
   // src/common/cuda/new_ops_cuda_kernel.cuh

   template <typename T>
   __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
       // forward here
   }

   ```

   Add cuda kernel launcher in `pytorch/cuda`.

   ```c++
   // src/pytorch/cuda
   #include <new_ops_cuda_kernel.cuh>

   void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
       // initialize
       at::cuda::CUDAGuard device_guard(input.device());
       cudaStream_t stream = at::cuda::getCurrentCUDAStream();
       ...
       AT_DISPATCH_FLOATING_TYPES_AND_HALF(
           input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
               new_ops_forward_cuda_kernel<scalar_t>
                   <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
                       input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
           }));
       AT_CUDA_CHECK(cudaGetLastError());
   }
   ```

2. Register implementation for different devices.

   ```c++
   // src/pytorch/cuda/cudabind.cpp
   ...

   Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
       // implement cuda forward here
       // use `NewOpsForwardCUDAKernelLauncher` here
   }
   // declare interface here.
   Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
   // register the implementation for given device (CUDA here).
   REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
   ```

3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.

   ```c++
   // src/pytorch/new_ops.cpp
   Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
       // dispatch the implementation according to the device type of input.
       DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
   }
   ...

   Tensor new_ops_forward(Tensor input, Tensor output, ...){
       return new_ops_forward_impl(input, output, ...);
   }
   ```

4. Bind the implementation in `pytorch/pybind.cpp`.

   ```c++
   // src/pytorch/pybind.cpp

   ...

   Tensor new_ops_forward(Tensor input, Tensor output, ...);

   ...

   // bind with pybind11
   m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
           py::arg("input"), py::arg("output"), ...);

   ...

   ```

5. Build MMCV again.
Enjoy new ops in python ```python from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) ... ext_module.new_ops_forward(input, output, ...) ``` ================================================ FILE: mmcv/ops/csrc/common/box_iou_rotated_utils.hpp ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved // modified from // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h #pragma once #include #include #if defined(__CUDACC__) || defined(__MUSACC__) // Designates functions callable from the host (CPU) and the device (GPU) #define HOST_DEVICE __host__ __device__ #define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ #else #include #define HOST_DEVICE #define HOST_DEVICE_INLINE HOST_DEVICE inline #endif namespace { template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; template struct Point { T x, y; HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} HOST_DEVICE_INLINE Point operator+(const Point& p) const { return Point(x + p.x, y + p.y); } HOST_DEVICE_INLINE Point& operator+=(const Point& p) { x += p.x; y += p.y; return *this; } HOST_DEVICE_INLINE Point operator-(const Point& p) const { return Point(x - p.x, y - p.y); } HOST_DEVICE_INLINE Point operator*(const T coeff) const { return Point(x * coeff, y * coeff); } }; template HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { return A.x * B.x + A.y * B.y; } template HOST_DEVICE_INLINE T cross_2d(const Point& A, const Point& B) { return A.x * B.y - B.x * A.y; } template HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox& box, Point (&pts)[4]) { // M_PI / 180. 
== 0.01745329251 // double theta = box.a * 0.01745329251; // MODIFIED double theta = box.a; T cosTheta2 = (T)cos(theta) * 0.5f; T sinTheta2 = (T)sin(theta) * 0.5f; // y: top --> down; x: left --> right pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; pts[2].x = 2 * box.x_ctr - pts[0].x; pts[2].y = 2 * box.y_ctr - pts[0].y; pts[3].x = 2 * box.x_ctr - pts[1].x; pts[3].y = 2 * box.y_ctr - pts[1].y; } template HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], const Point (&pts2)[4], Point (&intersections)[24]) { // Line vector // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] Point vec1[4], vec2[4]; for (int i = 0; i < 4; i++) { vec1[i] = pts1[(i + 1) % 4] - pts1[i]; vec2[i] = pts2[(i + 1) % 4] - pts2[i]; } // Line test - test all line combos for intersection int num = 0; // number of intersections for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { // Solve for 2x2 Ax=b T det = cross_2d(vec2[j], vec1[i]); // This takes care of parallel lines if (fabs(det) <= 1e-14) { continue; } auto vec12 = pts2[j] - pts1[i]; T t1 = cross_2d(vec2[j], vec12) / det; T t2 = cross_2d(vec1[i], vec12) / det; if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { intersections[num++] = pts1[i] + vec1[i] * t1; } } } // Check for vertices of rect1 inside rect2 { const auto& AB = vec2[0]; const auto& DA = vec2[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { // assume ABCD is the rectangle, and P is the point to be judged // P is inside ABCD iff. 
P's projection on AB lies within AB // and P's projection on AD lies within AD auto AP = pts1[i] - pts2[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { intersections[num++] = pts1[i]; } } } // Reverse the check - check for vertices of rect2 inside rect1 { const auto& AB = vec1[0]; const auto& DA = vec1[3]; auto ABdotAB = dot_2d(AB, AB); auto ADdotAD = dot_2d(DA, DA); for (int i = 0; i < 4; i++) { auto AP = pts2[i] - pts1[0]; auto APdotAB = dot_2d(AP, AB); auto APdotAD = -dot_2d(AP, DA); if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && (APdotAD <= ADdotAD)) { intersections[num++] = pts2[i]; } } } return num; } template HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], const int& num_in, Point (&q)[24], bool shift_to_zero = false) { assert(num_in >= 2); // Step 1: // Find point with minimum y // if more than 1 points have the same minimum y, // pick the one with the minimum x. 
int t = 0; for (int i = 1; i < num_in; i++) { if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { t = i; } } auto& start = p[t]; // starting point // Step 2: // Subtract starting point from every points (for sorting in the next step) for (int i = 0; i < num_in; i++) { q[i] = p[i] - start; } // Swap the starting point to position 0 auto tmp = q[0]; q[0] = q[t]; q[t] = tmp; // Step 3: // Sort point 1 ~ num_in according to their relative cross-product values // (essentially sorting according to angles) // If the angles are the same, sort according to their distance to origin T dist[24]; for (int i = 0; i < num_in; i++) { dist[i] = dot_2d(q[i], q[i]); } #if defined(__CUDACC__) || defined(__MUSACC__) // CUDA version // In the future, we can potentially use thrust // for sorting here to improve speed (though not guaranteed) for (int i = 1; i < num_in - 1; i++) { for (int j = i + 1; j < num_in; j++) { T crossProduct = cross_2d(q[i], q[j]); if ((crossProduct < -1e-6) || (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { auto q_tmp = q[i]; q[i] = q[j]; q[j] = q_tmp; auto dist_tmp = dist[i]; dist[i] = dist[j]; dist[j] = dist_tmp; } } } #else // CPU version std::sort(q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { T temp = cross_2d(A, B); if (fabs(temp) < 1e-6) { return dot_2d(A, A) < dot_2d(B, B); } else { return temp > 0; } }); // compute distance to origin after sort, since the points are now different. for (int i = 0; i < num_in; i++) { dist[i] = dot_2d(q[i], q[i]); } #endif // Step 4: // Make sure there are at least 2 points (that don't overlap with each other) // in the stack int k; // index of the non-overlapped second point for (k = 1; k < num_in; k++) { if (dist[k] > 1e-8) { break; } } if (k == num_in) { // We reach the end, which means the convex hull is just one point q[0] = p[t]; return 1; } q[1] = q[k]; int m = 2; // 2 points in the stack // Step 5: // Finally we can start the scanning process. 
// When a non-convex relationship between the 3 points is found // (either concave shape or duplicated points), // we pop the previous point from the stack // until the 3-point relationship is convex again, or // until the stack only contains two points for (int i = k + 1; i < num_in; i++) { while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { m--; } q[m++] = q[i]; } // Step 6 (Optional): // In general sense we need the original coordinates, so we // need to shift the points back (reverting Step 2) // But if we're only interested in getting the area/perimeter of the shape // We can simply return. if (!shift_to_zero) { for (int i = 0; i < m; i++) { q[i] += start; } } return m; } template HOST_DEVICE_INLINE T quadri_box_area(const Point (&q)[4]) { T area = 0; #pragma unroll for (int i = 1; i < 3; i++) { area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } return area / 2.0; } template HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { if (m <= 2) { return 0; } T area = 0; for (int i = 1; i < m - 1; i++) { area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); } return area / 2.0; } template HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox& box1, const RotatedBox& box2) { // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned // from rotated_rect_intersection_pts Point intersectPts[24], orderedPts[24]; Point pts1[4]; Point pts2[4]; get_rotated_vertices(box1, pts1); get_rotated_vertices(box2, pts2); int num = get_intersection_points(pts1, pts2, intersectPts); if (num <= 2) { return 0.0; } // Convex Hull to order the intersection points in clockwise order and find // the contour area. 
int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); return polygon_area(orderedPts, num_convex); } template HOST_DEVICE_INLINE T quadri_boxes_intersection(const Point (&pts1)[4], const Point (&pts2)[4]) { // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned // from rotated_rect_intersection_pts Point intersectPts[24], orderedPts[24]; int num = get_intersection_points(pts1, pts2, intersectPts); if (num <= 2) { return 0.0; } // Convex Hull to order the intersection points in clockwise order and find // the contour area. int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); return polygon_area(orderedPts, num_convex); } } // namespace template HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw, const int mode_flag) { // shift center to the middle point to achieve higher precision in result RotatedBox box1, box2; auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; box1.x_ctr = box1_raw[0] - center_shift_x; box1.y_ctr = box1_raw[1] - center_shift_y; box1.w = box1_raw[2]; box1.h = box1_raw[3]; box1.a = box1_raw[4]; box2.x_ctr = box2_raw[0] - center_shift_x; box2.y_ctr = box2_raw[1] - center_shift_y; box2.w = box2_raw[2]; box2.h = box2_raw[3]; box2.a = box2_raw[4]; const T area1 = box1.w * box1.h; const T area2 = box2.w * box2.h; if (area1 < 1e-14 || area2 < 1e-14) { return 0.f; } const T intersection = rotated_boxes_intersection(box1, box2); T baseS = 1.0; if (mode_flag == 0) { baseS = (area1 + area2 - intersection); } else if (mode_flag == 1) { baseS = area1; } const T iou = intersection / baseS; return iou; } template HOST_DEVICE_INLINE T single_box_iou_quadri(T const* const pts1_raw, T const* const pts2_raw, const int mode_flag) { // shift center to the middle point to achieve higher precision in result Point pts1[4], pts2[4]; auto center_shift_x = (pts1_raw[0] + pts2_raw[0] + pts1_raw[2] + 
pts2_raw[2] + pts1_raw[4] + pts2_raw[4] + pts1_raw[6] + pts2_raw[6]) / 8.0; auto center_shift_y = (pts1_raw[1] + pts2_raw[1] + pts1_raw[3] + pts2_raw[3] + pts1_raw[5] + pts2_raw[5] + pts1_raw[7] + pts2_raw[7]) / 8.0; pts1[0].x = pts1_raw[0] - center_shift_x; pts1[0].y = pts1_raw[1] - center_shift_y; pts1[1].x = pts1_raw[2] - center_shift_x; pts1[1].y = pts1_raw[3] - center_shift_y; pts1[2].x = pts1_raw[4] - center_shift_x; pts1[2].y = pts1_raw[5] - center_shift_y; pts1[3].x = pts1_raw[6] - center_shift_x; pts1[3].y = pts1_raw[7] - center_shift_y; pts2[0].x = pts2_raw[0] - center_shift_x; pts2[0].y = pts2_raw[1] - center_shift_y; pts2[1].x = pts2_raw[2] - center_shift_x; pts2[1].y = pts2_raw[3] - center_shift_y; pts2[2].x = pts2_raw[4] - center_shift_x; pts2[2].y = pts2_raw[5] - center_shift_y; pts2[3].x = pts2_raw[6] - center_shift_x; pts2[3].y = pts2_raw[7] - center_shift_y; const T area1 = quadri_box_area(pts1); const T area2 = quadri_box_area(pts2); if (area1 < 1e-14 || area2 < 1e-14) { return 0.f; } const T intersection = quadri_boxes_intersection(pts1, pts2); T baseS = 1.0; if (mode_flag == 0) { baseS = (area1 + area2 - intersection); } else if (mode_flag == 1) { baseS = area1; } const T iou = intersection / baseS; return iou; } ================================================ FILE: mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved. 
// Modified from // https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu #ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH #define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void active_rotated_filter_forward_cuda_kernel( const int nthreads, const scalar_t* weight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, const int num_rotations, const int nEntry, scalar_t* output_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; int i = index / nEntry / num_input_planes; int k; scalar_t val = *(weight_data + index); for (k = 0; k < num_rotations; k++) { int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; scalar_t* target = output_data + i * (num_rotations * num_input_planes * nEntry) + k * (num_input_planes * nEntry) + j * (nEntry) + idx; *target = val; } } } template __global__ void active_rotated_filter_backward_cuda_kernel( const int nthreads, const scalar_t* gradWeight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, const int num_rotations, const int nEntry, scalar_t* weight_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; int i = index / nEntry / num_input_planes; int k; scalar_t* val = weight_data + index; *val = 0; scalar_t tmp = 0; for (k = 0; k < num_rotations; k++) { int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; scalar_t target = *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + k * (num_input_planes * nEntry) + j * (nEntry) + idx); tmp = tmp + target; } *val = tmp; } } #endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH ================================================ FILE: 
mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH #define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif // input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K) // output: fout(B,O,N) // algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j) // i(k) = idx(b,i,k) // sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j) // avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k // max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j))) template __global__ void assign_score_withk_forward_cuda_kernel( const int B, const int N0, const int N1, const int M, const int K, const int O, const int aggregate, const T* points, const T* centers, const T* scores, const int64_t* knn_idx, T* output) { // ----- parallel loop for B, N1, K and O --------- CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { // ------- loop for M ---------- const int b = (int)(i / (O * N1 * K)); const int o = (int)(i % (O * N1 * K) / (N1 * K)); const int n = (int)(i % (N1 * K) / K); const int k = (int)(i % K); const int cn = (int)knn_idx[b * K * N1 + n * K + 0]; // The first neighbor is the center point const int kn = (int)knn_idx[b * K * N1 + n * K + k]; if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range return; } assert(b < B); assert(kn < N0); assert(cn < N0); assert(o < O); assert(n < N1); const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; T val = output[out_idx]; for (int m = 0; m < M; m++) { val += points[b * N0 * M * O + kn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m] - centers[b * N0 * M * O + cn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m]; } output[out_idx] = val; } } template __global__ void 
assign_score_withk_points_backward_cuda_kernel( const int B, const int N0, const int N, const int M, const int K, const int O, const int aggregate, const T* grad_out, const T* scores, const int64_t* knn_idx, T* grad_points, T* grad_centers) { // ----- parallel loop for B, M, O --------- CUDA_1D_KERNEL_LOOP(i, B * M * O) { int b = (int)(i / (M * O)); int m = (int)(i % (M * O) / O); int o = (int)(i % O); // ----- loop for N,K --------- for (int n = 0; n < N; n++) { for (int k = 0; k < K; k++) { int kn = knn_idx[b * N * K + n * K + k]; int cn = knn_idx[b * N * K + n * K + 0]; if (kn >= N0 || kn < 0) { // if index overflows, it is out of the // neighborhood range continue; } atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, scores[b * N * K * M + n * K * M + k * M + m] * grad_out[b * O * N * K + o * N * K + n * K + k]); atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, -scores[b * N * K * M + n * K * M + k * M + m] * grad_out[b * O * N * K + o * N * K + n * K + k]); } } } } template __global__ void assign_score_withk_scores_backward_cuda_kernel( const int B, const int N0, const int N, const int M, const int K, const int O, const int aggregate, const T* grad_out, const T* points, const T* centers, const int64_t* knn_idx, T* grad_scores) { // ----- parallel loop for B, N, K, M --------- CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { const int b = (int)(i / (N * M * K)); const int n = (int)(i % (N * M * K) / M / K); const int k = (int)(i % (M * K) / M); const int m = (int)(i % M); const int cn = knn_idx[b * N * K + n * K + 0]; const int kn = knn_idx[b * N * K + n * K + k]; if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range return; } // -------------- loop for O ------------------------ const int out_idx = b * N * K * M + n * K * M + k * M + m; T val = grad_scores[out_idx]; for (int o = 0; o < O; o++) { val += (points[b * N0 * M * O + kn * M * O + m * O + o] - centers[b * N0 * M * O + cn * M * O + m * O + o]) * 
grad_out[b * O * N * K + o * N * K + n * K + k]; } grad_scores[out_idx] = val; } } #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu #ifndef BALL_QUERY_CUDA_KERNEL_CUH #define BALL_QUERY_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, float min_radius, float max_radius, int nsample, const T* new_xyz, const T* xyz, int* idx) { // new_xyz: (B, M, 3) // xyz: (B, N, 3) // output: // idx: (B, M, nsample) int bs_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b) return; new_xyz += bs_idx * m * 3 + pt_idx * 3; xyz += bs_idx * n * 3; idx += bs_idx * m * nsample + pt_idx * nsample; float max_radius2 = max_radius * max_radius; float min_radius2 = min_radius * min_radius; T new_x = new_xyz[0]; T new_y = new_xyz[1]; T new_z = new_xyz[2]; int cnt = 0; for (int k = 0; k < n; ++k) { T x = xyz[k * 3 + 0]; T y = xyz[k * 3 + 1]; T z = xyz[k * 3 + 2]; T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) { idx[l] = k; } } idx[cnt] = k; ++cnt; if (cnt >= nsample) break; } } } } #endif // BALL_QUERY_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH #define BBOX_OVERLAPS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, T& y1, T& x2, T& y2) { x1 = bbox[base]; y1 = bbox[base + 1]; x2 = bbox[base + 2]; y2 = bbox[base + 3]; } template <> __device__ __forceinline__ void load_bbox(const float* bbox, const int base, float& x1, float& y1, float& x2, float& y2) { const float4 bbox_offset = reinterpret_cast(bbox + base)[0]; x1 = bbox_offset.x; y1 = bbox_offset.y; x2 = bbox_offset.z; y2 = bbox_offset.w; } template __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, T* ious, const int num_bbox1, const int num_bbox2, const int mode, const bool aligned, const int offset) { if (aligned) { CUDA_1D_KERNEL_LOOP(index, num_bbox1) { const int b1 = index; const int b2 = index; const int base1 = b1 << 2; // b1 * 4 T b1_x1, b1_y1, b1_x2, b1_y2; load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); const int base2 = b2 << 2; // b2 * 4 T b2_x1, b2_y1, b2_x2, b2_y2; load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); const T width = fmaxf(right - left + offset, 0.f); const T height = fmaxf(bottom - top + offset, 0.f); const T interS = width * height; const T baseS = fmaxf(mode == 0 ? 
b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } else { CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { const int b1 = index / num_bbox2; const int b2 = index % num_bbox2; const int base1 = b1 << 2; // b1 * 4 T b1_x1, b1_y1, b1_x2, b1_y2; load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); const int base2 = b2 << 2; // b2 * 4 T b2_x1, b2_y1, b2_x2, b2_y2; load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); const T width = fmaxf(right - left + offset, 0.f); const T height = fmaxf(bottom - top + offset, 0.f); const T interS = width * height; const T baseS = fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } } #if __CUDA_ARCH__ >= 530 __device__ __forceinline__ __half __half_area(const __half x1, const __half y1, const __half x2, const __half y2, const __half offset) { const __half half_w = __hadd(__hsub(x2, x1), offset); const __half half_h = __hadd(__hsub(y2, y1), offset); return __hmul(half_w, half_h); } __device__ __forceinline__ __half __half_max(const __half a, const __half b) { return __hge(a, b) ? a : b; } __device__ __forceinline__ __half __half_min(const __half a, const __half b) { return __hle(a, b) ? a : b; } // fp16 won't provide much increase when aligned==true. It is useful when // aligned==false, which would give you ~40% bonus. __device__ void bbox_overlaps_cuda_kernel_half( const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, const int num_bbox2, const int mode, const bool aligned, const int offset) { const int num_output = aligned ? 
num_bbox1 : num_bbox1 * num_bbox2; const __half h_offset = __int2half_rn(offset); CUDA_1D_KERNEL_LOOP(index, num_output) { const int b1 = aligned ? index : index / num_bbox2; const int b2 = aligned ? index : index % num_bbox2; const int base1 = b1 << 2; __half b1_x1, b1_y1, b1_x2, b1_y2; load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); const int base2 = b2 << 2; __half b2_x1, b2_y1, b2_x2, b2_y2; load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); const __half left = __half_max(b1_x1, b2_x1), right = __half_min(b1_x2, b2_x2); const __half top = __half_max(b1_y1, b2_y1), bottom = __half_min(b1_y2, b2_y2); const __half width = __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); const __half height = __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); const __half interS = __hmul(width, height); const __half baseS = __half_max( mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, h_offset); ious[index] = __hdiv(interS, baseS); } } #endif // __CUDA_ARCH__ >= 530 #endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/bezier_align_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu #ifndef BEZIER_ALIGN_CUDA_KERNEL_CUH #define BEZIER_ALIGN_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT template __device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3, const T u) { return ((1. 
- u) * (1. - u) * (1. - u) * p0 + 3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. - u) * p2 + u * u * u * p3); } template __global__ void bezier_align_forward_cuda_kernel( const int nthreads, const T *bottom_data, // inputs const T *bottom_rois, // bottom rois contains the bezier curve T *top_data, // outputs const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, bool aligned, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; // beziers have size Nx(1+8*2) = Nx17 const T *offset_bottom_rois = bottom_rois + n * 17; int roi_batch_ind = offset_bottom_rois[0]; // Do not use rounding; this implementation detail is critical T offset = aligned ? (T)0.5 : (T)0.0; // TODO: avoid this by using parallel annotation, for good T p0_x = offset_bottom_rois[1] * spatial_scale; T p0_y = offset_bottom_rois[2] * spatial_scale; T p1_x = offset_bottom_rois[3] * spatial_scale; T p1_y = offset_bottom_rois[4] * spatial_scale; T p2_x = offset_bottom_rois[5] * spatial_scale; T p2_y = offset_bottom_rois[6] * spatial_scale; T p3_x = offset_bottom_rois[7] * spatial_scale; T p3_y = offset_bottom_rois[8] * spatial_scale; T p4_x = offset_bottom_rois[15] * spatial_scale; T p4_y = offset_bottom_rois[16] * spatial_scale; T p5_x = offset_bottom_rois[13] * spatial_scale; T p5_y = offset_bottom_rois[14] * spatial_scale; T p6_x = offset_bottom_rois[11] * spatial_scale; T p6_y = offset_bottom_rois[12] * spatial_scale; T p7_x = offset_bottom_rois[9] * spatial_scale; T p7_y = offset_bottom_rois[10] * spatial_scale; // compute the coords const T u = pw / static_cast(pooled_width); const T v = ph / static_cast(pooled_height); const T x0 = bezier_curve(p0_x, p1_x, p2_x, 
p3_x, u); const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); const T x_center = x1 * v + x0 * (1. - v) - offset; const T y_center = y1 * v + y0 * (1. - v) - offset; T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin // When the grid is empty, output zeros == 0/1, instead of NaN. const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T y = y_center - (T)0.5 * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = x_center - (T)0.5 * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); output_val += val; } } output_val /= count; top_data[index] = output_val; } } template __global__ void bezier_align_backward_cuda_kernel( const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, bool aligned, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; // beziers have size Nx(1+8*2) = Nx17 const T *offset_bottom_rois = bottom_rois + n * 17; int roi_batch_ind = offset_bottom_rois[0]; // Do not use rounding; this implementation detail is critical T offset = aligned ? 
(T)0.5 : (T)0.0; T p0_x = offset_bottom_rois[1] * spatial_scale; T p0_y = offset_bottom_rois[2] * spatial_scale; T p1_x = offset_bottom_rois[3] * spatial_scale; T p1_y = offset_bottom_rois[4] * spatial_scale; T p2_x = offset_bottom_rois[5] * spatial_scale; T p2_y = offset_bottom_rois[6] * spatial_scale; T p3_x = offset_bottom_rois[7] * spatial_scale; T p3_y = offset_bottom_rois[8] * spatial_scale; T p4_x = offset_bottom_rois[15] * spatial_scale; T p4_y = offset_bottom_rois[16] * spatial_scale; T p5_x = offset_bottom_rois[13] * spatial_scale; T p5_y = offset_bottom_rois[14] * spatial_scale; T p6_x = offset_bottom_rois[11] * spatial_scale; T p6_y = offset_bottom_rois[12] * spatial_scale; T p7_x = offset_bottom_rois[9] * spatial_scale; T p7_y = offset_bottom_rois[10] * spatial_scale; // compute the coords const T u = pw / static_cast(pooled_width); const T v = ph / static_cast(pooled_height); const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); const T x_center = x1 * v + x0 * (1. - v) - offset; const T y_center = y1 * v + y0 * (1. - v) - offset; T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); T *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T *offset_top_diff = top_diff + top_offset; const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T y = y_center - (T)0.5 * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = x_center - (T)0.5 * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); T g1 = top_diff_this_bin * w1 / count; T g2 = top_diff_this_bin * w2 / count; T g3 = top_diff_this_bin * w3 / count; T g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); } // if } // ix } // iy } // CUDA_1D_KERNEL_LOOP } // BezierAlignBackward #endif // BEZIER_ALIGN_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved // modified from // https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu. // the main difference: (1) use `argmax_idx` for fast computing of gradient // during the backward. (2) `wh` is directly computed by `boxes`, rather than // passing it as argument to forward or backward functions. 
#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH #define BORDER_ALIGN_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 }; /*** Forward ***/ template __global__ void border_align_forward_cuda_kernel( const int nthreads, const T* input, const T* boxes, T* output, int* argmax_idx, const int channels, const int box_size, const int height, const int width, const int pool_size) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (batch_idx, c_idx, box_idx) is an element paralleled for computing // output, and `extreme_idx` is in range [0,3] int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx; const T *offset_box, *offset_input, *offset_box_x; T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y, val, maxval; extreme_idx = threadIdx.y; // shape (N, C, box_size, 4) for output batch_idx = index / channels / box_size; // shape (N, box_size, 4) for boxes box_idx = index % box_size + batch_idx * box_size; c_idx = (index / box_size) % channels; offset_box = boxes + box_idx * 4; box_width = *(offset_box + 2) - *offset_box; box_height = *(offset_box + 3) - *(offset_box + 1); offset_output = output + index * 4 + extreme_idx; offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; // shape (N, 4C, h, w) for input. 
// [0,C) for top feature, [C,2C) for left feature, // [2C,3C) for bottom feature, [3C,4C) for right feature offset_input = input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * height * width; // extreme_idx in [0,1] -> offset_box_x indexed at x1 // extreme_idx in [2,3] -> offset_box_x indexed at x2 offset_box_x = offset_box + extreme_idx / 2 * 2; // (x1,y1) or (x2,y2) for (x,y) x = *offset_box_x; y = *(offset_box_x + 1); switch (extreme_idx) { // top case BorderMode::Top: stride = box_width / pool_size; x_stride = stride; y_stride = 0; break; // left case BorderMode::Left: stride = box_height / pool_size; x_stride = 0; y_stride = stride; break; // bottom case BorderMode::Bottom: stride = box_width / pool_size; x_stride = -stride; y_stride = 0; break; // right case BorderMode::Right: stride = box_height / pool_size; x_stride = 0; y_stride = -stride; break; } // initialize maxval and maxidx with the start position (e.g. (x1,y1) or // (x2,y2)) maxval = bilinear_interpolate(offset_input, height, width, y, x, index); maxidx = 0; // do max_pool along the border for (int i = 1; i <= pool_size; i++) { x += x_stride; y += y_stride; val = bilinear_interpolate(offset_input, height, width, y, x, index); if (val > maxval) { maxval = val; maxidx = i; } } // update output and argmax_idx *offset_output = maxval; *offset_argmax_idx = maxidx; } } /*** Backward ***/ template __global__ void border_align_backward_cuda_kernel( const int nthreads, const T* grad_output, const T* boxes, const int* argmax_idx, T* grad_input, const int channels, const int box_size, const int height, const int width, const int pool_size) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (batch_idx, c_idx, box_idx) is an element paralleled for computing // output, and `extreme_idx` is in range [0,3] int batch_idx, c_idx, box_idx, extreme_idx; const int* offset_argmax_idx; const T *offset_grad_output, *offset_box, *offset_box_x; T *offset_grad_input, box_width, box_height, stride, x_stride, 
y_stride, x, y; extreme_idx = threadIdx.y; batch_idx = index / channels / box_size; box_idx = index % box_size + batch_idx * box_size; c_idx = (index / box_size) % channels; offset_box = boxes + box_idx * 4; box_width = *(offset_box + 2) - *offset_box; box_height = *(offset_box + 3) - *(offset_box + 1); offset_grad_output = grad_output + index * 4 + extreme_idx; offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; // [0,C) for top feature grad, [C,2C) for left feature grad, // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad offset_grad_input = grad_input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * height * width; // extreme_idx in [0,1] -> offset_box_x indexed at x1 // extreme_idx in [2,3] -> offset_box_x indexed at x2 offset_box_x = offset_box + extreme_idx / 2 * 2; switch (extreme_idx) { // top case BorderMode::Top: stride = box_width / pool_size; x_stride = stride; y_stride = 0; break; // left case BorderMode::Left: stride = box_height / pool_size; x_stride = 0; y_stride = stride; break; // bottom case BorderMode::Bottom: stride = box_width / pool_size; x_stride = -stride; y_stride = 0; break; // right case BorderMode::Right: stride = box_height / pool_size; x_stride = 0; y_stride = -stride; break; } // get position (x,y) which has maximum value during forward x = *offset_box_x; y = *(offset_box_x + 1); x += x_stride * (T)(*offset_argmax_idx); y += y_stride * (T)(*offset_argmax_idx); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); // update grad_output atomicAdd(offset_grad_input + y_low * width + x_low, *offset_grad_output * w1); atomicAdd(offset_grad_input + y_low * width + x_high, *offset_grad_output * w2); atomicAdd(offset_grad_input + y_high * width + x_low, *offset_grad_output * w3); atomicAdd(offset_grad_input + y_high * width + x_high, *offset_grad_output * w4); } } #endif // BORDER_ALIGN_CUDA_KERNEL_CUH 
================================================ FILE: mmcv/ops/csrc/common/cuda/box_iou_quadri_cuda.cuh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved #ifndef BOX_IOU_QUADRI_CUDA_CUH #define BOX_IOU_QUADRI_CUDA_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #include "box_iou_rotated_utils.hpp" // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } template __global__ void box_iou_quadri_cuda_kernel( const int n_boxes1, const int n_boxes2, const T* dev_boxes1, const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { if (aligned) { CUDA_1D_KERNEL_LOOP(index, n_boxes1) { int b1 = index; int b2 = index; int base1 = b1 * 8; float block_boxes1[8]; float block_boxes2[8]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; block_boxes1[5] = dev_boxes1[base1 + 5]; block_boxes1[6] = dev_boxes1[base1 + 6]; block_boxes1[7] = dev_boxes1[base1 + 7]; int base2 = b2 * 8; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; block_boxes2[5] = dev_boxes2[base2 + 5]; block_boxes2[6] = dev_boxes2[base2 + 6]; block_boxes2[7] = dev_boxes2[base2 + 7]; dev_ious[index] = single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); } } else { CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { int b1 = index / n_boxes2; int b2 = index % n_boxes2; int base1 = b1 * 8; float block_boxes1[8]; float block_boxes2[8]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = 
dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; block_boxes1[5] = dev_boxes1[base1 + 5]; block_boxes1[6] = dev_boxes1[base1 + 6]; block_boxes1[7] = dev_boxes1[base1 + 7]; int base2 = b2 * 8; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; block_boxes2[5] = dev_boxes2[base2 + 5]; block_boxes2[6] = dev_boxes2[base2 + 6]; block_boxes2[7] = dev_boxes2[base2 + 7]; dev_ious[index] = single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); } } } #endif ================================================ FILE: mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved // modified from // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu #ifndef BOX_IOU_ROTATED_CUDA_CUH #define BOX_IOU_ROTATED_CUDA_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #include "box_iou_rotated_utils.hpp" // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } template __global__ void box_iou_rotated_cuda_kernel( const int n_boxes1, const int n_boxes2, const T* dev_boxes1, const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { if (aligned) { CUDA_1D_KERNEL_LOOP(index, n_boxes1) { int b1 = index; int b2 = index; int base1 = b1 * 5; float block_boxes1[5]; float block_boxes2[5]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; int base2 = b2 * 
5; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; dev_ious[index] = single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); } } else { CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { int b1 = index / n_boxes2; int b2 = index % n_boxes2; int base1 = b1 * 5; float block_boxes1[5]; float block_boxes2[5]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; int base2 = b2 * 5; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; dev_ious[index] = single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); } } } #endif ================================================ FILE: mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef CARAFE_CUDA_KERNEL_CUH #define CARAFE_CUDA_KERNEL_CUH #include #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #ifdef MMCV_WITH_HIP #define WARP_SIZE 64 #else #define WARP_SIZE 32 #endif #define THREADS_PER_PIXEL 32 #define MAX_SHARED_MEMORY 49152 #define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 #define MAXIMIZE_KERNEL_SIZE true #define kTileDim 32 #define kBlockRows 8 #define FULL_MASK 0xffffffff inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } __device__ inline int Loc2Index(const int n, const int c, const int h, const int w, const int channel_num, const int height, const int width) { int index = w + (h + (c + n * channel_num) * height) * width; return index; } #ifndef MMCV_WITH_HIP /* TODO: move this to a common place */ template __device__ inline scalar_t min(scalar_t a, scalar_t b) { return a < b ? a : b; } template __device__ inline scalar_t max(scalar_t a, scalar_t b) { return a > b ? a : b; } #endif template __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) #ifdef MMCV_WITH_HIP val += __shfl_down(val, offset); #else val += __shfl_down_sync(FULL_MASK, val, offset); #endif return val; } template <> __device__ __forceinline__ phalf warpReduceSum(phalf val) { for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) #ifdef MMCV_WITH_HIP // Using PyTorch's macro for half support __PHALF(val) += WARP_SHFL_DOWN(val, offset); #else __PHALF(val) += __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); #endif return val; } // Splits the original matrix into submatrices with size 32 * 32. // Each block transposes one submatrix by loading it into shared memory. 
// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ template __global__ void BatchTranspose2DCUDAKernel(const int N, const int H, const int W, const int dh, const int dw, const scalar_t *__restrict__ X, scalar_t *__restrict__ Y) { __shared__ scalar_t tile[kTileDim][kTileDim + 1]; const int n = blockIdx.x / (dh * dw); const int k = blockIdx.x % (dh * dw); const int r = k / dw; const int c = k % dw; const int offset = n * H * W; int x = c * kTileDim + threadIdx.x; int y = r * kTileDim + threadIdx.y; if (x < W) { for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; } } __syncthreads(); x = r * kTileDim + threadIdx.x; y = c * kTileDim + threadIdx.y; if (x < H) { for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; } } } template __global__ void CARAFEForward( const int num_kernels, const scalar_t *__restrict__ bottom_data, const scalar_t *__restrict__ bottom_masks, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int down_height, const int down_width, const int height, const int width, const int mask_channels, scalar_t *__restrict__ top_data) { #if MAXIMIZE_KERNEL_SIZE __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; #else __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; #endif int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; const int split_id = threadIdx.x % THREADS_PER_PIXEL; index = index / THREADS_PER_PIXEL; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; const int down_pw = pw / scale_factor; const int down_ph = ph / scale_factor; const int start_w = down_pw - (kernel_size - 1) / 2; const int end_w = down_pw + (kernel_size - 1) / 2 + 1; const int 
start_h = down_ph - (kernel_size - 1) / 2; const int end_h = down_ph + (kernel_size - 1) / 2 + 1; for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels); shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; } __syncthreads(); const int channels_per_group = ceilf(channels / (float)group_size); #pragma unroll for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { int mask_group = c / channels_per_group; scalar_t output_val = 0; #pragma unroll for (int iy = start_h; iy < end_h; iy++) { #pragma unroll for (int ix = start_w; ix < end_w; ix++) { if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { continue; } int mask_iy = iy - down_ph + (kernel_size - 1) / 2; int mask_ix = ix - down_pw + (kernel_size - 1) / 2; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, iy, ix, c, down_height, down_width, channels); output_val += bottom_data[feat_index] * shared_mask[mask_c * WARP_SIZE + pixel_id]; } } int top_index = Loc2Index(n, ph, pw, c, height, width, channels); top_data[top_index] = output_val; } } template __global__ void CARAFEBackward_Feature( const int num_kernels, const scalar_t *__restrict__ top_diff, const scalar_t *__restrict__ bottom_masks, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int down_height, const int down_width, const int height, const int width, const int mask_channels, scalar_t *__restrict__ bottom_diff) { #if MAXIMIZE_KERNEL_SIZE __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; #else __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; #endif int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; const int split_id = threadIdx.x % THREADS_PER_PIXEL; // (n, c, ph, pw) is an element in the bottom_data index = index / 
THREADS_PER_PIXEL; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; const int start_w = pw - (kernel_size - 1) * scale_factor / 2; const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1; const int start_h = ph - (kernel_size - 1) * scale_factor / 2; const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1; for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { const int mask_w = (c % kernel_size) * scale_factor; const int mask_h = (c / kernel_size % kernel_size) * scale_factor; const int mask_x = start_w + mask_w; const int mask_y = start_h + mask_h; if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) { shared_mask[c * WARP_SIZE + pixel_id] = 0; continue; } const int mask_group = c / (kernel_size * kernel_size); const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1; int mask_index = Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width); shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; } __syncthreads(); const int channels_per_group = ceilf(channels / (float)group_size); #pragma unroll for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { int mask_group = c / channels_per_group; int top_index = Loc2Index(n, ph, pw, c, height, width, channels); scalar_t output_val = 0; #pragma unroll for (int iy = start_h; iy < end_h; iy += scale_factor) { #pragma unroll for (int ix = start_w; ix < end_w; ix += scale_factor) { if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) { continue; } int mask_iy = (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor; int mask_ix = (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, iy, ix, c, height, width, channels); output_val += shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index]; } } bottom_diff[top_index] = output_val; } 
} template __global__ void FeatureSum(const int num_kernels, const scalar_t *__restrict__ input_data, const int scale_factor, const int channels, const int height, const int width, scalar_t *__restrict__ output_data) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int split_id = threadIdx.x % THREADS_PER_PIXEL; index = index / THREADS_PER_PIXEL; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { scalar_t output_val = 0; for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) { for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) { int input_id = Loc2Index(n, iy, ix, c, height * scale_factor, width * scale_factor, channels); output_val += input_data[input_id]; } } const int output_id = Loc2Index(n, ph, pw, c, height, width, channels); output_data[output_id] = output_val; } } template __global__ void CARAFEBackward_Mask(const int num_kernels, const scalar_t *__restrict__ top_diff, const scalar_t *__restrict__ bottom_data, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int down_height, const int down_width, const int height, const int width, const int mask_channels, scalar_t *__restrict__ mask_diff) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int lane_id = index % WARP_SIZE; index = index / WARP_SIZE; const int mask_c = index % mask_channels; // (n, c, ph, pw) is an element in the bottom_data index = index / mask_channels; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; const int down_pw = pw / scale_factor; const int down_ph = ph / scale_factor; const int mask_group = mask_c / (kernel_size * kernel_size); const int mask_loc = mask_c % (kernel_size * kernel_size); const int offset_x = mask_loc % kernel_size 
- (kernel_size - 1) / 2; const int offset_y = mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2; const int down_x = down_pw + offset_x; const int down_y = down_ph + offset_y; scalar_t output_val = 0; if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 && down_x <= down_width - 1) { const int channels_per_mask = ceilf(channels / (float)group_size); const int start = channels_per_mask * mask_group; const int end = min(channels_per_mask * (mask_group + 1), channels); for (int c = start + lane_id; c < end; c += WARP_SIZE) { int bottom_id = Loc2Index(n, down_y, down_x, c, down_height, down_width, channels); int top_id = Loc2Index(n, ph, pw, c, height, width, channels); output_val += top_diff[top_id] * bottom_data[bottom_id]; } } #ifdef MMCV_WITH_HIP __syncthreads(); #else __syncwarp(); #endif output_val = warpReduceSum(output_val); if (lane_id == 0) { const int mask_id = Loc2Index(n, ph, pw, mask_c, height, width, mask_channels); mask_diff[mask_id] = output_val; } } #endif // CARAFE_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH #define CARAFE_NAIVE_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif __device__ inline int Loc2Index(const int n, const int c, const int h, const int w, const int channel_num, const int height, const int width) { int index = w + (h + (c + n * channel_num) * height) * width; return index; } template __global__ void carafe_naive_forward_cuda_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the bottom_data int pw = index % width; int ph = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; int mask_channels = kernel_size * kernel_size * group_size; int mask_group = c / (channels / group_size); int down_pw = pw / scale_factor; int down_ph = ph / scale_factor; int down_width = width / scale_factor; int down_height = height / scale_factor; int start_w = down_pw - (kernel_size - 1) / 2; int end_w = down_pw + (kernel_size - 1) / 2 + 1; int start_h = down_ph - (kernel_size - 1) / 2; int end_h = down_ph + (kernel_size - 1) / 2 + 1; scalar_t output_val = 0; for (int iy = start_h; iy < end_h; iy++) { for (int ix = start_w; ix < end_w; ix++) { if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { continue; } int mask_iy = iy - down_ph + (kernel_size - 1) / 2; int mask_ix = ix - down_pw + (kernel_size - 1) / 2; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, c, iy, ix, channels, down_height, down_width); int mask_index = Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); output_val += bottom_data[feat_index] * bottom_masks[mask_index]; } } 
top_data[index] = output_val; } } template __global__ void carafe_naive_backward_cuda_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data, const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the bottom_data int pw = index % width; int ph = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; int mask_channels = kernel_size * kernel_size * group_size; int mask_group = c / (channels / group_size); int down_pw = pw / scale_factor; int down_ph = ph / scale_factor; int down_width = width / scale_factor; int down_height = height / scale_factor; int start_w = down_pw - (kernel_size - 1) / 2; int end_w = down_pw + (kernel_size - 1) / 2 + 1; int start_h = down_ph - (kernel_size - 1) / 2; int end_h = down_ph + (kernel_size - 1) / 2 + 1; for (int iy = start_h; iy < end_h; iy++) { for (int ix = start_w; ix < end_w; ix++) { if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { continue; } int mask_iy = iy - down_ph + (kernel_size - 1) / 2; int mask_ix = ix - down_pw + (kernel_size - 1) / 2; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, c, iy, ix, channels, down_height, down_width); int mask_index = Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); atomicAdd(bottom_diff + feat_index, bottom_masks[mask_index] * top_diff[index]); atomicAdd(mask_diff + mask_index, bottom_data[feat_index] * top_diff[index]); } } } } #endif // CARAFE_NAIVE_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved. 
// Modified from // https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu #ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH #define CHAMFER_DISTANCE_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 template __global__ void chamfer_distance_forward_cuda_kernel(int b, int n, const scalar_t* xyz, int m, const scalar_t* xyz2, scalar_t* result, int* result_i) { __shared__ scalar_t buf[MAX_SHARED_SCALAR_T]; for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) { int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2; for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) { buf[j] = xyz2[(i * m + k2) * 2 + j]; } __syncthreads(); for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { scalar_t x1 = xyz[(i * n + j) * 2 + 0]; scalar_t y1 = xyz[(i * n + j) * 2 + 1]; int best_i = 0; scalar_t best = 1e10; int end_ka = end_k & (~3); if (end_ka == THREADS_PER_BLOCK) { for (int k = 0; k < THREADS_PER_BLOCK; k += 4) { #pragma unroll for (int j = 0; j < 4; ++j) { scalar_t x2 = buf[(k + j) * 2] - x1; scalar_t y2 = buf[(k + j) * 2 + 1] - y1; scalar_t d = x2 * x2 + y2 * y2; if (d < best) { best = d; best_i = k + k2 + j; } } } } else { for (int k = 0; k < end_ka; k += 4) { #pragma unroll for (int j = 0; j < 4; ++j) { scalar_t x2 = buf[(k + j) * 2] - x1; scalar_t y2 = buf[(k + j) * 2 + 1] - y1; scalar_t d = x2 * x2 + y2 * y2; if (d < best) { best = d; best_i = k + k2 + j; } } } } for (int k = end_ka; k < end_k; k++) { scalar_t x2 = buf[k * 2 + 0] - x1; scalar_t y2 = buf[k * 2 + 1] - y1; scalar_t d = x2 * x2 + y2 * y2; if (k == 0 || d < best) { best = d; best_i = k + k2; } } if (k2 == 0 || result[(i * n + j)] > best) { result[(i * n + j)] = best; result_i[(i * n + j)] = best_i; } } __syncthreads(); } } } template __global__ void chamfer_distance_backward_cuda_kernel( int b, int 
n, const scalar_t* xyz1, int m, const scalar_t* xyz2, const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1, scalar_t* grad_xyz2) { for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { scalar_t x1 = xyz1[(i * n + j) * 2 + 0]; scalar_t y1 = xyz1[(i * n + j) * 2 + 1]; int j2 = idx1[i * n + j]; scalar_t x2 = xyz2[(i * m + j2) * 2 + 0]; scalar_t y2 = xyz2[(i * m + j2) * 2 + 1]; scalar_t g = grad_dist1[i * n + j] * 2; atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2)); atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2)); atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2))); atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2))); } } } #endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp ================================================ #ifndef COMMON_CUDA_HELPER #define COMMON_CUDA_HELPER #include #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) #define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) \ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ j += blockDim.y * gridDim.y) #define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \ for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \ for (size_t j = blockIdx.y; j < (m); j += gridDim.y) #define THREADS_PER_BLOCK 512 inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) { int optimal_block_num = (N + num_threads - 1) / num_threads; int max_block_num = 4096; return min(optimal_block_num, max_block_num); } template __device__ T bilinear_interpolate(const T* input, const int height, const int width, T y, T x, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < 
-1.0 || y > height || x < -1.0 || x > width) return 0; if (y <= 0) y = 0; if (x <= 0) x = 0; int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation T v1 = input[y_low * width + x_low]; T v2 = input[y_low * width + x_high]; T v3 = input[y_high * width + x_low]; T v4 = input[y_high * width + x_high]; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, int& x_low, int& x_high, int& y_low, int& y_high, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y <= 0) y = 0; if (x <= 0) x = 0; y_low = (int)y; x_low = (int)x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. 
- lx; // reference in forward // T v1 = input[y_low * width + x_low]; // T v2 = input[y_low * width + x_high]; // T v3 = input[y_high * width + x_low]; // T v4 = input[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } #endif // COMMON_CUDA_HELPER ================================================ FILE: mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef CONVEX_IOU_CUDA_KERNEL_CUH #define CONVEX_IOU_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #define MAXN 100 #define NMAX 512 __device__ const double EPS = 1E-8; __device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); } struct Point { double x, y; __device__ Point() {} __device__ Point(double x, double y) : x(x), y(y) {} }; __device__ inline bool point_same(Point& a, Point& b) { return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0; } __device__ inline void swap1(Point* a, Point* b) { Point temp; temp.x = a->x; temp.y = a->y; a->x = b->x; a->y = b->y; b->x = temp.x; b->y = temp.y; } __device__ inline void reverse1(Point* a, const int n) { for (int i = 0; i < (n - 1) / 2.0; i++) { Point* j = &(a[i]); Point* k = &(a[n - 1 - i]); swap1(j, k); } } __device__ inline double cross(Point o, Point a, Point b) { return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); } __device__ inline double dis(Point a, Point b) { return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); } __device__ inline double area(Point* ps, int n) { ps[n] = ps[0]; double res = 0; for (int i = 0; i < n; i++) { res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; } return res / 2.0; } __device__ inline double polygon_area_grad(Point* ps, int n, int* polygon_to_pred_index, int n_pred, double* grad_C) { ps[n] = ps[0]; double partion_grad[4 * 30 + 2]; double res = 
0; for (int i = 0; i < n; i++) { res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; partion_grad[i * 4 + 2] = ps[i + 1].y; partion_grad[i * 4 + 3] = -ps[i + 1].x; if (i != n - 1) { partion_grad[i * 4 + 4] = -ps[i].y; partion_grad[i * 4 + 5] = ps[i].x; } else { partion_grad[0] = -ps[i].y; partion_grad[1] = ps[i].x; } } for (int i = 0; i < n; i++) { for (int j = 0; j < n_pred; j++) { if (i == polygon_to_pred_index[j]) { grad_C[2 * polygon_to_pred_index[j + n_pred]] = (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2; break; } } for (int j = 0; j < n_pred; j++) { if (i == polygon_to_pred_index[j]) { grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] = (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2; break; } } } return res / 2.0; } __device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p, double* cut_grad, int m, int n, int i) { double s1, s2; double s2_s1_2; double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd; double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd; s1 = cross(a, b, c); s2 = cross(a, b, d); ds1_dxc = -(b.y - a.y); ds1_dyc = b.x - a.x; ds2_dxd = ds1_dxc; ds2_dyd = ds1_dyc; s2_s1_2 = (s2 - s1) * (s2 - s1); if (sig(s1) == 0 && sig(s2) == 0) return 2; if (sig(s2 - s1) == 0) return 0; dxp_dxc = ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) / (s2_s1_2); dxp_dyc = ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) / (s2_s1_2); dxp_dxd = ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) / (s2_s1_2); dxp_dyd = ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) / (s2_s1_2); dyp_dxc = ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) / (s2_s1_2); dyp_dyc = ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) / (s2_s1_2); dyp_dxd = ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) / (s2_s1_2); dyp_dyd = ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * 
(ds2_dyd)) / (s2_s1_2); p.x = (c.x * s2 - d.x * s1) / (s2 - s1); p.y = (c.y * s2 - d.y * s1) / (s2 - s1); if (i == n - 1) { cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd; cut_grad[4 * n * m + 1] = dyp_dxd; cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd; cut_grad[4 * n * m + 3] = dyp_dyd; } else { cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd; cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd; cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd; cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd; } return 1; } __device__ inline void polygon_cut(Point* p, int& n, Point a, Point b, double* cut_grad) { Point pp[MAXN]; double ccur_grad[MAXN] = {}; int m = 0; p[n] = p[0]; int k = n; for (int i = 0; i < n; i++) { if (sig(cross(a, b, p[i])) > 0) { pp[m] = p[i]; ccur_grad[4 * n * m + 4 * i] = 1.0; ccur_grad[4 * n * m + 4 * i + 3] = 1.0; m++; } if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i); m++; } } n = 0; for (int i = 0; i < m; i++) { if (!i || !(point_same(pp[i], pp[i - 1]))) { p[n] = pp[i]; for (int j = 0; j < 4 * k; j++) { cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j]; } n++; } } while (n > 1 && point_same(p[n - 1], p[0])) n--; } __device__ inline double intersectArea(Point a, Point b, Point c, Point d, double* grad_AB, int order, int convex_n) { Point o(0, 0); int res_flag = 0; int s1 = sig(cross(o, a, b)); int s2 = sig(cross(o, c, d)); if (s1 == 0 || s2 == 0) return 0.0; if (s1 == -1) { Point* i = &a; Point* j = &b; swap1(i, j); res_flag = 1; } if (s2 == -1) { Point* i = &c; Point* j = 
&d; swap1(i, j); } Point p[10] = {o, a, b}; int n = 3, n0 = 3, n1, n2, n3; double cut_grad1[MAXN] = {}; double cut_grad2[MAXN] = {}; double cut_grad3[MAXN] = {}; double p1_p_grad[10][10] = {}; double p2_p1_grad[10][10] = {}; double p3_p2_grad[10][10] = {}; double p3_p1_grad[10][10] = {}; double p3_p_grad[10][10] = {}; // 1 polygon_cut(p, n, o, c, cut_grad1); n1 = n; for (int i = 0; i < n; i++) { for (int j = 0; j < 4 * n0; j++) { if (!(j % 2)) { p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j]; } else { p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j]; } } } // 2 polygon_cut(p, n, c, d, cut_grad2); n2 = n; for (int i = 0; i < n; i++) { for (int j = 0; j < 4 * n1; j++) { if (!(j % 2)) { p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j]; } else { p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j]; } } } // 3 polygon_cut(p, n, d, o, cut_grad3); n3 = n; for (int i = 0; i < n; i++) { for (int j = 0; j < 4 * n2; j++) { if (!(j % 2)) { p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j]; } else { p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j]; } } } // mul // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1) for (int i = 0; i < 2 * n3; i++) { for (int j = 0; j < 2 * n1; j++) { double sum = 0.0; for (int m = 0; m < 2 * n2; m++) { sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j]; } p3_p1_grad[i][j] = sum; } } // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0) for (int i = 0; i < 2 * n3; i++) { for (int j = 0; j < 2 * n0; j++) { double sum = 0.0; for (int m = 0; m < 2 * n1; m++) { sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j]; } p3_p_grad[i][j] = sum; } } // calculate S_grad int polygon_index_box_index[20]; double grad_polygon[20]; double S_grad[6]; for (int i = 0; i < n3; i++) { polygon_index_box_index[i] = i; polygon_index_box_index[i + n3] = i; } double res = polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon); if (s1 * s2 == -1) { for (int j = 0; j < 2 * 3; j++) { double sum = 0.0; for (int m = 0; m < 2 * n3; m++) { sum = 
sum - grad_polygon[m] * p3_p_grad[m][j]; } S_grad[j] = sum; } if (order != convex_n - 1) { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[2 * order + 2] += S_grad[2]; grad_AB[2 * order + 3] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[2 * order + 2] += S_grad[4]; grad_AB[2 * order + 3] += S_grad[5]; } } else { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[0] += S_grad[2]; grad_AB[1] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[0] += S_grad[4]; grad_AB[1] += S_grad[5]; } } res = -res; } else { for (int j = 0; j < 2 * 3; j++) { double sum = 0.0; for (int m = 0; m < 2 * n3; m++) { sum = sum + grad_polygon[m] * p3_p_grad[m][j]; } S_grad[j] = sum; } if (order != convex_n - 1) { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[2 * order + 2] += S_grad[2]; grad_AB[2 * order + 3] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[2 * order + 2] += S_grad[4]; grad_AB[2 * order + 3] += S_grad[5]; } } else { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[0] += S_grad[2]; grad_AB[1] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[0] += S_grad[4]; grad_AB[1] += S_grad[5]; } } } return res; } __device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2, double* grad_AB) { if (area(ps1, n1) < 0) reverse1(ps1, n1); if (area(ps2, n2) < 0) reverse1(ps2, n2); ps1[n1] = ps1[0]; ps2[n2] = ps2[0]; double res = 0; for (int i = 0; i < n1; i++) { for (int j = 0; j < n2; j++) { res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1); } } return res; } __device__ inline void Jarvis(Point* in_poly, int& n_poly) { Point p_max, p_k; int max_index, k_index; int 
Stack[NMAX] = {}, top1, top2; double sign; Point right_point[10], left_point[10]; for (int i = 0; i < n_poly; i++) { if (in_poly[i].y < in_poly[0].y || in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { Point* j = &(in_poly[0]); Point* k = &(in_poly[i]); swap1(j, k); } if (i == 0) { p_max = in_poly[0]; max_index = 0; } if (in_poly[i].y > p_max.y || in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { p_max = in_poly[i]; max_index = i; } } if (max_index == 0) { max_index = 1; p_max = in_poly[max_index]; } k_index = 0, Stack[0] = 0, top1 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > dis(in_poly[Stack[top1]], p_k)))) { p_k = in_poly[i]; k_index = i; } } top1++; Stack[top1] = k_index; } for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]]; k_index = 0, Stack[0] = 0, top2 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > dis(in_poly[Stack[top2]], p_k))) { p_k = in_poly[i]; k_index = i; } } top2++; Stack[top2] = k_index; } for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]]; for (int i = 0; i < top1 + top2; i++) { if (i <= top1) { in_poly[i] = right_point[i]; } else { in_poly[i] = left_point[top2 - (i - top1)]; } } n_poly = top1 + top2; } __device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2, int n2, double* grad_C) { Point polygon[MAXN]; int n = n1 + n2, n_poly = 0; for (int i = 0; i < n1; i++) { for (int j = 0; j < n - n1; j++) { if (point_same(ps1[i], ps2[j])) { for (int k = j; k < n - n1 - 1; k++) { ps2[k] = ps2[k + 1]; } n2--; break; } } } n_poly = n1 + n2; for (int i = 0; i < n_poly; i++) { if (i < n1) { polygon[i] = ps1[i]; } else { 
polygon[i] = ps2[i - n1]; } } Jarvis(polygon, n_poly); int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; int n_pred = 0; for (int i = 0; i < n_poly; i++) { for (int j = 0; j < n1; j++) { if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) { polygon_to_pred_index[n_pred] = i; polygon_to_pred_index[n_pred + n1] = j; n_pred += 1; break; } } } if (n_pred == 0) { double polygon_area = fabs(area(polygon, n_poly)); for (int i = 0; i < 18; i++) { grad_C[i] = 0.0; } return polygon_area; } else { double polygon_area = polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C); if (polygon_area < 0) { for (int i = 0; i < 18; i++) { grad_C[i] = -grad_C[i]; } } return fabs(polygon_area); } } // convex_find and get the polygon_index_box_index __device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly, int* points_to_convex_ind) { int n_input = n_poly; Point input_poly[20]; for (int i = 0; i < n_input; i++) { input_poly[i].x = in_poly[i].x; input_poly[i].y = in_poly[i].y; } Point p_max, p_k; int max_index, k_index; int Stack[20], top1, top2; double sign; Point right_point[10], left_point[10]; for (int i = 0; i < n_poly; i++) { if (in_poly[i].y < in_poly[0].y || in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { Point* j = &(in_poly[0]); Point* k = &(in_poly[i]); swap1(j, k); } if (i == 0) { p_max = in_poly[0]; max_index = 0; } if (in_poly[i].y > p_max.y || in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { p_max = in_poly[i]; max_index = i; } } if (max_index == 0) { max_index = 1; p_max = in_poly[max_index]; } k_index = 0, Stack[0] = 0, top1 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > dis(in_poly[Stack[top1]], p_k)))) { p_k = in_poly[i]; k_index = i; } } top1++; Stack[top1] = k_index; } for (int i = 
0; i <= top1; i++) { right_point[i] = in_poly[Stack[i]]; } k_index = 0, Stack[0] = 0, top2 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > dis(in_poly[Stack[top2]], p_k))) { p_k = in_poly[i]; k_index = i; } } top2++; Stack[top2] = k_index; } for (int i = top2 - 1; i >= 0; i--) { left_point[i] = in_poly[Stack[i]]; } for (int i = 0; i < top1 + top2; i++) { if (i <= top1) { in_poly[i] = right_point[i]; } else { in_poly[i] = left_point[top2 - (i - top1)]; } } n_poly = top1 + top2; for (int i = 0; i < n_poly; i++) { for (int j = 0; j < n_input; j++) { if (point_same(in_poly[i], input_poly[j])) { points_to_convex_ind[i] = j; break; } } } } template __device__ inline float devrIoU(T const* const p, T const* const q, T* point_grad, const int idx) { Point ps1[MAXN], ps2[MAXN]; Point convex[MAXN]; for (int i = 0; i < 9; i++) { convex[i].x = (double)p[i * 2]; convex[i].y = (double)p[i * 2 + 1]; } int n_convex = 9; int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; Jarvis_and_index(convex, n_convex, points_to_convex_ind); int n1 = n_convex; int n2 = 4; for (int i = 0; i < n1; i++) { ps1[i].x = (double)convex[i].x; ps1[i].y = (double)convex[i].y; } for (int i = 0; i < n2; i++) { ps2[i].x = (double)q[i * 2]; ps2[i].y = (double)q[i * 2 + 1]; } int polygon_index_box_index[18]; for (int i = 0; i < n1; i++) { polygon_index_box_index[i] = i; polygon_index_box_index[i + n1] = i; } double grad_A[18] = {}; double grad_AB[18] = {}; double grad_C[18] = {}; double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB); double S_pred = polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A); if (S_pred < 0) { for (int i = 0; i < n_convex * 2; i++) { grad_A[i] = -grad_A[i]; } } double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; double iou = inter_area / union_area; 
double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C); // printf("%d:live\n", idx); double rot_giou = iou - (polygon_area - union_area) / polygon_area; float grad_point_temp[18] = {}; for (int i = 0; i < n_convex; i++) { int grad_point = points_to_convex_ind[i]; grad_point_temp[2 * grad_point] = (float)((union_area + inter_area) / (union_area * union_area) * grad_AB[2 * i] - iou / union_area * grad_A[2 * i] - 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) - (union_area) / polygon_area / polygon_area * grad_C[2 * i]); grad_point_temp[2 * grad_point + 1] = (float)((union_area + inter_area) / (union_area * union_area) * grad_AB[2 * i + 1] - iou / union_area * grad_A[2 * i + 1] - 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) - (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]); } for (int i = 0; i < 9; i++) { point_grad[2 * i] = grad_point_temp[2 * i]; point_grad[2 * i + 1] = grad_point_temp[2 * i + 1]; } return (float)rot_giou; } template __global__ void convex_giou_cuda_kernel(const int ex_n_boxes, const int gt_n_boxes, const T* ex_boxes, const T* gt_boxes, T* point_grad) { CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { const T* cur_box = ex_boxes + index * 18; const T* cur_gt_box = gt_boxes + index * 8; T* cur_grad = point_grad + index * 19; T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x); cur_grad[18] = giou; } } __device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) { double s1, s2; s1 = cross(a, b, c); s2 = cross(a, b, d); if (sig(s1) == 0 && sig(s2) == 0) return 2; if (sig(s2 - s1) == 0) return 0; p.x = (c.x * s2 - d.x * s1) / (s2 - s1); p.y = (c.y * s2 - d.y * s1) / (s2 - s1); return 1; } __device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) { Point pp[MAXN]; int m = 0; p[n] = p[0]; for (int i = 0; i < n; i++) { if (sig(cross(a, b, p[i])) > 0) { pp[m] = p[i]; m++; } if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { lineCross(a, b, p[i], p[i + 1], pp[m]); m++; } 
} n = 0; for (int i = 0; i < m; i++) { if (!i || !(point_same(pp[i], pp[i - 1]))) { p[n] = pp[i]; n++; } } while (n > 1 && point_same(p[n - 1], p[0])) n--; } __device__ inline double intersectArea(Point a, Point b, Point c, Point d) { Point o(0, 0); int s1 = sig(cross(o, a, b)); int s2 = sig(cross(o, c, d)); if (s1 == 0 || s2 == 0) return 0.0; if (s1 == -1) { Point* i = &a; Point* j = &b; swap1(i, j); } if (s2 == -1) { Point* i = &c; Point* j = &d; swap1(i, j); } Point p[10] = {o, a, b}; int n = 3; polygon_cut(p, n, o, c); polygon_cut(p, n, c, d); polygon_cut(p, n, d, o); double res = area(p, n); if (s1 * s2 == -1) res = -res; return res; } __device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2) { if (area(ps1, n1) < 0) reverse1(ps1, n1); if (area(ps2, n2) < 0) reverse1(ps2, n2); ps1[n1] = ps1[0]; ps2[n2] = ps2[0]; double res = 0; for (int i = 0; i < n1; i++) { for (int j = 0; j < n2; j++) { res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]); } } return res; } template __device__ inline float devrIoU(T const* const p, T const* const q) { Point ps1[MAXN], ps2[MAXN]; Point convex[MAXN]; for (int i = 0; i < 9; i++) { convex[i].x = (double)p[i * 2]; convex[i].y = (double)p[i * 2 + 1]; } int n_convex = 9; int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; Jarvis_and_index(convex, n_convex, points_to_convex_ind); int n1 = n_convex; for (int i = 0; i < n1; i++) { ps1[i].x = (double)convex[i].x; ps1[i].y = (double)convex[i].y; } int n2 = 4; for (int i = 0; i < n2; i++) { ps2[i].x = (double)q[i * 2]; ps2[i].y = (double)q[i * 2 + 1]; } double inter_area = intersectAreaO(ps1, n1, ps2, n2); double S_pred = area(ps1, n1); double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; double iou = inter_area / union_area; return (float)iou; } template __global__ void convex_iou_cuda_kernel(const int ex_n_boxes, const int gt_n_boxes, const T* ex_boxes, const T* gt_boxes, T* iou) { CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { 
const T* cur_box = ex_boxes + index * 18; for (int i = 0; i < gt_n_boxes; i++) { iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8); } } } #endif // CONVEX_IOU_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/correlation_cuda.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved. // Modified from // https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu // Original licence: Under MIT License #ifndef CORRELATION_CUDA #define CORRELATION_CUDA #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #include #include // Using is recommended in the official documentation in // https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. // However, we use for compatibility with CUDA 9.0 // Read https://github.com/pytorch/extension-cpp/issues/35 for more details. 
#include #include #include using namespace torch; #define TensorAcc4R PackedTensorAccessor32 #define TensorAcc5R PackedTensorAccessor32 #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W) #define WARP_SIZE 32 #define FULL_MASK 0xffffffff template __global__ void correlation_forward_cuda_kernel( const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output, int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW, int oH, int oW) { const int iH = rInput1.size(1); const int iW = rInput1.size(2); const int C = rInput1.size(3); const int n = blockIdx.x; const int h = blockIdx.y * blockDim.y + threadIdx.y; const int w = blockIdx.z * blockDim.z + threadIdx.z; if (h >= oH || w >= oW) return; const int thread = threadIdx.x; const int start_i = -padH + h * dH; const int start_j = -padW + w * dW; const int patchRadH = dilation_patchH * (patchH - 1) / 2; const int patchRadW = dilation_patchW * (patchW - 1) / 2; for (int ph = 0; ph < patchH; ++ph) { int ph_dilated = ph * dilation_patchH - patchRadH; for (int pw = 0; pw < patchW; ++pw) { int pw_dilated = pw * dilation_patchW - patchRadW; scalar_t prod_sum = 0.0f; for (int i = 0; i < kH; ++i) { int i1 = start_i + i * dilationH; int i2 = i1 + ph_dilated; if (WITHIN_BOUNDS(i1, i2, iH, iH)) { for (int j = 0; j < kW; ++j) { int j1 = start_j + j * dilationW; int j2 = j1 + pw_dilated; if (WITHIN_BOUNDS(j1, j2, iW, iW)) { for (int c = thread; c < C; c += WARP_SIZE) { scalar_t v1 = rInput1[n][i1][j1][c]; scalar_t v2 = rInput2[n][i2][j2][c]; prod_sum += v1 * v2; } } } } } // accumulate for (int offset = 16; offset > 0; offset /= 2) #ifdef MMCV_WITH_HIP prod_sum += __shfl_down(float(prod_sum), offset); #else prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset); #endif if (thread == 0) { output[n][ph][pw][h][w] = prod_sum; } } } } template __global__ void correlation_backward_cuda_kernel_input1( const 
TensorAcc5R grad_output, const TensorAcc4R input2, TensorAcc4R grad_input1, const int kH, const int kW, const int patchH, const int patchW, const int padH, const int padW, const int dilationH, const int dilationW, const int dilation_patchH, const int dilation_patchW, const int dH, const int dW) { const int iH = input2.size(1); const int iW = input2.size(2); const int C = input2.size(3); const int H = grad_output.size(3); const int W = grad_output.size(4); const int patchRadH = (patchH - 1) / 2; const int patchRadW = (patchW - 1) / 2; const int n = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; const int h_2 = h + padH; const int w_2 = w + padW; const int min_h = h_2 - kH * dilationH; const int min_w = w_2 - kW * dilationW; extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; scalar_t *grad_cache = reinterpret_cast(grad_cache_char); for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { const int ph = i / patchW; const int pw = i % patchW; int i1 = h + dilation_patchH * (ph - patchRadH); int j1 = w + dilation_patchW * (pw - patchRadW); if (WITHIN_BOUNDS(i1, j1, iH, iW)) { scalar_t grad_val = 0.0f; for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { int i2 = (h_3) / dH; if (i2 * dH != h_3) continue; for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { int j2 = (w_3) / dW; if (j2 * dW != w_3) continue; if (WITHIN_BOUNDS(i2, j2, H, W)) { grad_val += grad_output[n][ph][pw][i2][j2]; } } } grad_cache[i] = grad_val; } } __syncthreads(); for (int c = threadIdx.x; c < C; c += blockDim.x) { scalar_t grad_input_val = 0.0f; for (int ph = 0; ph < patchH; ++ph) { int i1 = h + dilation_patchH * (ph - patchRadH); for (int pw = 0; pw < patchW; ++pw) { int j1 = w + dilation_patchW * (pw - patchRadW); if (WITHIN_BOUNDS(i1, j1, iH, iW)) { grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw]; } } } grad_input1[n][c][h][w] = grad_input_val; } } template __global__ void correlation_backward_cuda_kernel_input2( const 
TensorAcc5R grad_output, const TensorAcc4R input1, TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW) { const int iH = input1.size(1); const int iW = input1.size(2); const int C = input1.size(3); const int patchRadH = (patchH - 1) / 2; const int patchRadW = (patchW - 1) / 2; const int H = grad_output.size(3); const int W = grad_output.size(4); const int dilatedKH = kH * dilationH; const int dilatedKW = kW * dilationW; const int n = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; scalar_t *grad_cache = reinterpret_cast(grad_cache_char); for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { const int ph = i / patchW; const int pw = i % patchW; int i1 = h - dilation_patchH * (ph - patchRadH); int j1 = w - dilation_patchW * (pw - patchRadW); if (WITHIN_BOUNDS(i1, j1, iH, iW)) { scalar_t grad_val = 0.0f; const int h_2 = i1 + padH; const int w_2 = j1 + padW; const int min_h = h_2 - dilatedKH; const int min_w = w_2 - dilatedKW; for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { int i2 = (h_3) / dH; if (i2 * dH != h_3) continue; for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { int j2 = (w_3) / dW; if (j2 * dW != w_3) continue; if (WITHIN_BOUNDS(i2, j2, H, W)) { grad_val += grad_output[n][ph][pw][i2][j2]; } } } grad_cache[i] = grad_val; } } __syncthreads(); for (int c = threadIdx.x; c < C; c += blockDim.x) { scalar_t grad_input_val = 0.0f; for (int ph = 0; ph < patchH; ++ph) { int i1 = h - dilation_patchH * (ph - patchRadH); for (int pw = 0; pw < patchW; ++pw) { int j1 = w - dilation_patchW * (pw - patchRadW); if (WITHIN_BOUNDS(i1, j1, iH, iW)) { grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw]; } } } grad_input2[n][c][h][w] = grad_input_val; } } #endif ================================================ FILE: 
mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer ***************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, *this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ********************* * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.cuh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. 
* \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng */ // modified from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu #ifndef DEFORM_CONV_CUDA_KERNEL_CUH #define DEFORM_CONV_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT template __device__ T deformable_im2col_bilinear(const T *input, const int data_width, const int height, const int width, T h, T w) { if (h <= -1 || height <= h || w <= -1 || width <= w) { return 0; } int h_low = floorf(h); int w_low = floorf(w); int h_high = h_low + 1; int w_high = w_low + 1; T lh = h - h_low; T lw = w - w_low; T hh = 1 - lh, hw = 1 - lw; T v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; T v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; T v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; T v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floorf(argmax_h); int argmax_w_low = floorf(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; T weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - 
argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height, const int width, const T *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floorf(argmax_h); int argmax_w_low = floorf(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; T weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void deformable_im2col_gpu_kernel( const int n, const T 
*data_im, const T *data_offset, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, T *data_col) { CUDA_1D_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; T *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; const T *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; T val = static_cast(0); const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); *data_col_ptr = val; data_col_ptr += batch_size * height_col * width_col; } } } } template __global__ 
void deformable_col2im_gpu_kernel( const int n, const T *data_col, const T *data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, T *grad_im) { CUDA_1D_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const T *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; const T cur_inv_h_data = h_in + i * dilation_h + offset_h; const T cur_inv_w_data = w_in + j * dilation_w + offset_w; const T cur_top_grad = data_col[index]; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * 
channels + c) * height + cur_h + dy) * width + cur_w + dx; T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void deformable_col2im_coord_gpu_kernel( const int n, const T *data_col, const T *data_im, const T *data_offset, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, T *grad_offset) { CUDA_1D_KERNEL_LOOP(index, n) { T val = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const T *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const T *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const T *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % 
kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; T inv_h = h_in + i * dilation_h + offset_h; T inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) inv_h = inv_w = -2; const T weight = get_coordinate_weight(inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos]; cnt += 1; } grad_offset[index] = val; } } #endif // DEFORM_CONV_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/deform_roi_pool_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef DEFORM_ROI_POOL_CUDA_KERNEL_CUH #define DEFORM_ROI_POOL_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void deform_roi_pool_forward_cuda_kernel( const int nthreads, const T* input, const T* rois, const T* offset, T* output, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, const T gamma, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start_w = offset_rois[1] * spatial_scale - 0.5; T roi_start_h = offset_rois[2] * spatial_scale - 0.5; T roi_end_w = offset_rois[3] * spatial_scale - 0.5; T roi_end_h = offset_rois[4] * spatial_scale - 0.5; T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); // Compute roi offset if (offset != NULL) { const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 + ph * pooled_width + pw; T offset_roi_w = gamma * roi_width * offset_cur_w[0]; T offset_roi_h = gamma * roi_height * offset_cur_w[pooled_width * pooled_height]; roi_start_w += offset_roi_w; roi_start_h += offset_roi_h; } // We do average pooling inside a bin const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_input, height, width, y, x, index); output_val += val; } } output[index] = output_val / count; } } template __global__ void deform_roi_pool_backward_cuda_kernel( const int nthreads, const T* grad_output, const T* input, const T* rois, const T* offset, T* grad_input, T* grad_offset, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, const T gamma, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; const T* offset_input = input + ((roi_batch_ind * channels + c) * height * width); T* offset_grad_input = grad_input + ((roi_batch_ind * channels + c) * height * width); // Do not using rounding; this implementation detail is critical T roi_start_w = offset_rois[1] * spatial_scale - 0.5; T roi_start_h = offset_rois[2] * 
spatial_scale - 0.5; T roi_end_w = offset_rois[3] * spatial_scale - 0.5; T roi_end_h = offset_rois[4] * spatial_scale - 0.5; T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); // Compute roi offset if (offset != NULL) { const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 + ph * pooled_width + pw; T offset_roi_w = gamma * roi_width * offset_cur_w[0]; T offset_roi_h = gamma * roi_height * offset_cur_w[pooled_width * pooled_height]; roi_start_w += offset_roi_w; roi_start_h += offset_roi_h; } // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 const T grad_output_this_bin = grad_output[index] / count; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_grad_input + y_low * width + x_low, grad_output_this_bin * w1); atomicAdd(offset_grad_input + y_low * width + x_high, grad_output_this_bin * w2); atomicAdd(offset_grad_input + y_high * width + x_low, grad_output_this_bin * w3); atomicAdd(offset_grad_input + y_high * width + x_high, grad_output_this_bin * w4); if (offset != NULL) { T input_00 = offset_input[y_low * width + x_low]; T input_10 = offset_input[y_low * width + x_high]; T input_01 = offset_input[y_high * width + x_low]; T input_11 = offset_input[y_high * width + x_high]; T ogx = gamma * roi_width * grad_output_this_bin * (input_11 * (y - y_low) + input_10 * (y_high - y) + input_01 * (y_low - y) + input_00 * (y - y_high)); T ogy = gamma * roi_height * grad_output_this_bin * (input_11 * (x - x_low) + input_01 * (x_high - x) + input_10 * (x_low - x) + input_00 * (x - x_high)); atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + ph * pooled_width + pw, ogx); atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 + pooled_width * pooled_height + ph * pooled_width + pw, ogy); } } } } } } #endif // DEFORM_ROI_POOL_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved // Adapted from // https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #define MAX_NUM_VERT_IDX 9 #define INTERSECTION_OFFSET 8 #define EPSILON 1e-8 inline int opt_n_thread(int work_size) { const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); } /* compare normalized vertices (vertices around (0,0)) if vertex1 < vertex2 return true. order: minimum at x-aixs, become larger in anti-clockwise direction */ __device__ bool compare_vertices(float x1, float y1, float x2, float y2) { if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) return false; // if equal, return false if (y1 > 0 && y2 < 0) return true; if (y1 < 0 && y2 > 0) return false; float n1 = x1 * x1 + y1 * y1 + EPSILON; float n2 = x2 * x2 + y2 * y2 + EPSILON; float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; if (y1 > 0 && y2 > 0) { if (diff > EPSILON) return true; else return false; } if (y1 < 0 && y2 < 0) { if (diff < EPSILON) return true; else return false; } return false; } __global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel( int b, int n, int m, const float *__restrict__ vertices, const bool *__restrict__ mask, const int *__restrict__ num_valid, int *__restrict__ idx) { int batch_idx = blockIdx.x; vertices += batch_idx * n * m * 2; mask += batch_idx * n * m; num_valid += batch_idx * n; idx += batch_idx * n * MAX_NUM_VERT_IDX; int index = threadIdx.x; // index of polygon int stride = blockDim.x; for (int i = index; i < n; i += stride) { int pad; // index of arbitrary invalid intersection point (not box corner!) 
for (int j = INTERSECTION_OFFSET; j < m; ++j) { if (!mask[i * m + j]) { pad = j; break; } } if (num_valid[i] < 3) { // not enough vertices, take an invalid intersection point // (zero padding) for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { idx[i * MAX_NUM_VERT_IDX + j] = pad; } } else { // sort the valid vertices // note the number of valid vertices is known // note: check that num_valid[i] < MAX_NUM_VERT_IDX for (int j = 0; j < num_valid[i]; ++j) { // initialize with a "big" value float x_min = 1; float y_min = -EPSILON; int i_take = 0; int i2; float x2, y2; if (j != 0) { i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; x2 = vertices[i * m * 2 + i2 * 2 + 0]; y2 = vertices[i * m * 2 + i2 * 2 + 1]; } for (int k = 0; k < m; ++k) { float x = vertices[i * m * 2 + k * 2 + 0]; float y = vertices[i * m * 2 + k * 2 + 1]; if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { x_min = x; y_min = y; i_take = k; } } } idx[i * MAX_NUM_VERT_IDX + j] = i_take; } // duplicate the first idx idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; // pad zeros for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { idx[i * MAX_NUM_VERT_IDX + j] = pad; } // for corner case: the two boxes are exactly the same. 
// in this case, idx would have duplicate elements, which makes the // shoelace formula broken because of the definition, the duplicate // elements only appear in the first 8 positions (they are "corners in // box", not "intersection of edges") if (num_valid[i] == 8) { int counter = 0; for (int j = 0; j < 4; ++j) { int check = idx[i * MAX_NUM_VERT_IDX + j]; for (int k = 4; k < INTERSECTION_OFFSET; ++k) { if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; } } if (counter == 4) { idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { idx[i * MAX_NUM_VERT_IDX + j] = pad; } } } // TODO: still might need to cover some other corner cases :( } } } ================================================ FILE: mmcv/ops/csrc/common/cuda/furthest_point_sample_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH #define FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2) { const float v1 = dists[idx1], v2 = dists[idx2]; const int i1 = dists_i[idx1], i2 = dists_i[idx2]; dists[idx1] = max(v1, v2); dists_i[idx1] = v2 > v1 ? 
i2 : i1; } template __global__ void furthest_point_sampling_forward_cuda_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { // dataset: (B, N, 3) // tmp: (B, N) // output: // idx: (B, M) if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * 3; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; float x1 = dataset[old * 3 + 0]; float y1 = dataset[old * 3 + 1]; float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { float x2, y2, z2; x2 = dataset[k * 3 + 0]; y2 = dataset[k * 3 + 1]; z2 = dataset[k * 3 + 2]; // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); // if (mag <= 1e-3) // continue; float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); #pragma unroll for (int block_size_thres = 1024; block_size_thres >= 2; block_size_thres >>= 1) { const int tid_thres = block_size_thres / 2; if (block_size >= block_size_thres && tid < tid_thres) { __update(dists, dists_i, tid, tid + tid_thres); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } // Modified from // https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu template __global__ void furthest_point_sampling_with_dist_forward_cuda_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { // dataset: (B, N, N) // tmp: (B, N) // output: // idx: (B, M) if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * n; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; // float x1 = dataset[old * 3 + 0]; // float y1 = dataset[old * 3 + 1]; // float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { // float x2, y2, z2; // x2 = dataset[k * 3 + 0]; // y2 = dataset[k * 3 + 1]; // z2 = dataset[k * 3 + 2]; // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * // (z2 - z1); float d = dataset[old * n + k]; float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); #pragma unroll for (int block_size_thres = 1024; block_size_thres >= 2; block_size_thres >>= 1) { const int tid_thres = block_size_thres / 2; if (block_size >= block_size_thres && tid < tid_thres) { __update(dists, dists_i, tid, tid + tid_thres); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } #endif // FURTHEST_POINT_SAMPLE_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef GATHER_POINTS_CUDA_KERNEL_CUH #define GATHER_POINTS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #define TOTAL_THREADS 1024 template __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, const T *points, const int *__restrict__ idx, T *out) { // points: (B, C, N) // idx: (B, M) // output: // out: (B, C, M) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b || c_idx >= c) return; out += bs_idx * c * m + c_idx * m + pt_idx; idx += bs_idx * m + pt_idx; points += bs_idx * c * n + c_idx * n; out[0] = points[idx[0]]; } } template __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx, T *grad_points) { // grad_out: (B, C, M) // idx: (B, M) // output: // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b || c_idx >= c) return; grad_out += bs_idx * c * m + c_idx * m + pt_idx; idx += bs_idx * m + pt_idx; grad_points += bs_idx * c * n + c_idx * n; atomicAdd(grad_points + idx[0], grad_out[0]); } } #endif // GATHER_POINTS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh 
================================================ // Copyright (c) OpenMMLab. All rights reserved. // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu #ifndef GROUP_POINTS_CUDA_KERNEL_CUH #define GROUP_POINTS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void group_points_forward_cuda_kernel(int b, int c, int n, int npoints, int nsample, const T *points, const int *__restrict__ idx, T *out) { // points: (B, C, N) // idx: (B, npoints, nsample) // output: // out: (B, C, npoints, nsample) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { if (bs_idx >= b || c_idx >= c) return; int pt_idx = index / nsample; int sample_idx = index % nsample; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; int in_idx = bs_idx * c * n + c_idx * n + idx[0]; int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; out[out_idx] = points[in_idx]; } } template __global__ void group_points_backward_cuda_kernel(int b, int c, int n, int npoints, int nsample, const T *grad_out, const int *__restrict__ idx, T *grad_points) { // grad_out: (B, C, npoints, nsample) // idx: (B, npoints, nsample) // output: // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { int pt_idx = index / nsample; if (bs_idx >= b || c_idx >= c) return; int sample_idx = index % nsample; grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); } } #endif // GROUP_POINTS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh 
================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef IOU3D_CUDA_KERNEL_CUH #define IOU3D_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif const int THREADS_PER_BLOCK_IOU3D = 16; const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; __device__ const float EPS = 1e-8; struct Point { float x, y; __device__ Point() {} __device__ Point(double _x, double _y) { x = _x, y = _y; } __device__ void set(float _x, float _y) { x = _x; y = _y; } __device__ Point operator+(const Point &b) const { return Point(x + b.x, y + b.y); } __device__ Point operator-(const Point &b) const { return Point(x - b.x, y - b.y); } }; __device__ inline float cross(const Point &a, const Point &b) { return a.x * b.y - a.y * b.x; } __device__ inline float cross(const Point &p1, const Point &p2, const Point &p0) { return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); } __device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2) { int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) && min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y); return ret; } __device__ inline int check_in_box2d(const float *box, const Point &p) { // params: box (7) [x, y, z, dx, dy, dz, heading] const float MARGIN = 1e-2; float center_x = box[0], center_y = box[1]; // rotate the point in the opposite direction of box float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN); } __device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans_point) { // fast exclusion if (check_rect_cross(p0, p1, q0, q1) == 0) 
return 0; // check cross standing float s1 = cross(q0, p1, p0); float s2 = cross(p1, q1, p0); float s3 = cross(p0, q1, q0); float s4 = cross(q1, p1, q0); if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; // calculate intersection of two lines float s5 = cross(q1, p1, p0); if (fabs(s5 - s1) > EPS) { ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); } else { float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; float D = a0 * b1 - a1 * b0; ans_point.x = (b0 * c1 - b1 * c0) / D; ans_point.y = (a1 * c0 - a0 * c1) / D; } return 1; } __device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p) { float new_x = (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; p.set(new_x, new_y); } __device__ inline int point_cmp(const Point &a, const Point &b, const Point ¢er) { return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); } __device__ inline float box_overlap(const float *box_a, const float *box_b) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] float a_angle = box_a[6], b_angle = box_b[6]; float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; Point center_a(box_a[0], box_a[1]); Point center_b(box_b[0], box_b[1]); Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); box_a_corners[1].set(a_x2, a_y1); box_a_corners[2].set(a_x2, a_y2); box_a_corners[3].set(a_x1, a_y2); Point 
box_b_corners[5]; box_b_corners[0].set(b_x1, b_y1); box_b_corners[1].set(b_x2, b_y1); box_b_corners[2].set(b_x2, b_y2); box_b_corners[3].set(b_x1, b_y2); // get oriented corners float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); for (int k = 0; k < 4; k++) { rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); } box_a_corners[4] = box_a_corners[0]; box_b_corners[4] = box_b_corners[0]; // get intersection of lines Point cross_points[16]; Point poly_center; int cnt = 0, flag = 0; poly_center.set(0, 0); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); if (flag) { poly_center = poly_center + cross_points[cnt]; cnt++; } } } // check corners for (int k = 0; k < 4; k++) { if (check_in_box2d(box_a, box_b_corners[k])) { poly_center = poly_center + box_b_corners[k]; cross_points[cnt] = box_b_corners[k]; cnt++; } if (check_in_box2d(box_b, box_a_corners[k])) { poly_center = poly_center + box_a_corners[k]; cross_points[cnt] = box_a_corners[k]; cnt++; } } poly_center.x /= cnt; poly_center.y /= cnt; // sort the points of polygon Point temp; for (int j = 0; j < cnt - 1; j++) { for (int i = 0; i < cnt - j - 1; i++) { if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { temp = cross_points[i]; cross_points[i] = cross_points[i + 1]; cross_points[i + 1] = temp; } } } // get the overlap areas float area = 0; for (int k = 0; k < cnt - 1; k++) { area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); } return fabs(area) / 2.0; } __device__ inline float iou_bev(const float *box_a, const float *box_b) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] float sa = box_a[3] * box_a[4]; float sb = 
box_b[3] * box_b[4]; float s_overlap = box_overlap(box_a, box_b); return s_overlap / fmaxf(sa + sb - s_overlap, EPS); } __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { if (a_idx >= num_a || b_idx >= num_b) { return; } const float *cur_box_a = boxes_a + a_idx * 7; const float *cur_box_b = boxes_b + b_idx * 7; float cur_overlap = box_overlap(cur_box_a, cur_box_b); ans_overlap[a_idx * num_b + b_idx] = cur_overlap; } } __global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num, const float nms_overlap_thresh, const float *boxes, unsigned long long *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { // if (row_start > col_start) return; const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + 
threadIdx.x) * 7 + 5]; block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const float *cur_box = boxes + cur_box_idx * 7; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; mask[cur_box_idx * col_blocks + col_start] = t; } } } __device__ inline float iou_normal(float const *const a, float const *const b) { // params: a: [x, y, z, dx, dy, dz, heading] // params: b: [x, y, z, dx, dy, dz, heading] float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float interS = width * height; float Sa = a[3] * a[4]; float Sb = b[3] * b[4]; return interS / fmaxf(Sa + Sb - interS, EPS); } __global__ void iou3d_nms3d_normal_forward_cuda_kernel( const int boxes_num, const float nms_overlap_thresh, const float *boxes, unsigned long long *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { // if (row_start > col_start) return; const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x 
* 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const float *cur_box = boxes + cur_box_idx * 7; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; mask[cur_box_idx * col_blocks + col_start] = t; } } } #endif // IOU3D_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved // Modified from // https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap #ifndef KNN_CUDA_KERNEL_CUH #define KNN_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif inline __device__ void swap_float(float *x, float *y) { float tmp = *x; *x = *y; *y = tmp; } inline __device__ void swap_int(int *x, int *y) { int tmp = *x; *x = *y; *y = tmp; } __device__ void reheap(float *dist, int *idx, int k) { int root = 0; int child = root * 2 + 1; while (child < k) { if (child + 1 < k && dist[child + 1] > dist[child]) child++; if (dist[root] > dist[child]) return; swap_float(&dist[root], &dist[child]); swap_int(&idx[root], &idx[child]); root = child; child = root * 2 + 1; } } __device__ void heap_sort(float *dist, int *idx, int k) { int i; for (i = k - 1; i > 0; i--) { swap_float(&dist[0], &dist[i]); swap_int(&idx[0], &idx[i]); reheap(dist, idx, i); } } // input: xyz (b, n, 3) new_xyz (b, m, 3) // output: idx (b, m, nsample) dist2 (b, m, nsample) template __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, const T *xyz, const T *new_xyz, int *__restrict__ idx, T *dist2) { int bs_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b) return; new_xyz += bs_idx * m * 3 + pt_idx * 3; xyz += bs_idx * n * 3; idx += bs_idx * m * nsample + pt_idx * nsample; dist2 += bs_idx * m * nsample + pt_idx * nsample; T new_x = new_xyz[0]; T new_y = new_xyz[1]; T new_z = new_xyz[2]; float best_dist[100]; int best_idx[100]; for (int i = 0; i < nsample; i++) { best_dist[i] = 1e10; best_idx[i] = 0; } for (int i = 0; i < n; i++) { T x = xyz[i * 3 + 0]; T y = xyz[i * 3 + 1]; T z = xyz[i * 3 + 2]; T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < best_dist[0]) { best_dist[0] = d2; best_idx[0] = i; reheap(best_dist, best_idx, nsample); } } heap_sort(best_dist, best_idx, nsample); for (int i = 0; i < 
nsample; i++) { idx[i] = best_idx[i]; dist2[i] = best_dist[i]; } } } #endif // KNN_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/masked_conv2d_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef MASKED_CONV2D_CUDA_KERNEL_CUH #define MASKED_CONV2D_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void MaskedIm2colForward(const int n, const scalar_t *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int64_t *mask_h_idx, const int64_t *mask_w_idx, const int mask_cnt, scalar_t *data_col) { // mask_cnt * channels CUDA_1D_KERNEL_LOOP(index, n) { const int m_index = index % mask_cnt; const int h_col = mask_h_idx[m_index]; const int w_col = mask_w_idx[m_index]; const int c_im = index / mask_cnt; const int c_col = c_im * kernel_h * kernel_w; const int h_offset = h_col - pad_h; const int w_offset = w_col - pad_w; scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; for (int i = 0; i < kernel_h; ++i) { int h_im = h_offset + i; for (int j = 0; j < kernel_w; ++j) { int w_im = w_offset + j; if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { *data_col_ptr = (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; } else { *data_col_ptr = 0.0; } data_col_ptr += mask_cnt; } } } } template __global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, const int height, const int width, const int channels, const int64_t *mask_h_idx, const int64_t *mask_w_idx, const int mask_cnt, scalar_t *data_im) { CUDA_1D_KERNEL_LOOP(index, n) { const int m_index = index % mask_cnt; const int h_im = mask_h_idx[m_index]; const int w_im = mask_w_idx[m_index]; const int c_im = index / mask_cnt; // compute the start and end of the output data_im[(c_im * height + h_im) * width + w_im] = 
data_col[index]; } } #endif // MASKED_CONV2D_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH #define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #define MAXN 20 __device__ const float PI = 3.1415926; struct Point { float x, y; __device__ Point() {} __device__ Point(float x, float y) : x(x), y(y) {} }; __device__ inline void swap1(Point *a, Point *b) { Point temp; temp.x = a->x; temp.y = a->y; a->x = b->x; a->y = b->y; b->x = temp.x; b->y = temp.y; } __device__ inline float cross(Point o, Point a, Point b) { return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); } __device__ inline float dis(Point a, Point b) { return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); } __device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { float convex_points[2][MAXN]; for (int j = 0; j < n_points; j++) { convex_points[0][j] = ps[j].x; } for (int j = 0; j < n_points; j++) { convex_points[1][j] = ps[j].y; } Point edges[MAXN]; float edges_angles[MAXN]; float unique_angles[MAXN]; int n_edges = n_points - 1; int n_unique = 0; int unique_flag = 0; for (int i = 0; i < n_edges; i++) { edges[i].x = ps[i + 1].x - ps[i].x; edges[i].y = ps[i + 1].y - ps[i].y; } for (int i = 0; i < n_edges; i++) { edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); if (edges_angles[i] >= 0) { edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); } else { edges_angles[i] = edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); } } unique_angles[0] = edges_angles[0]; n_unique += 1; for (int i = 1; i < n_edges; i++) { for (int j = 0; j < n_unique; j++) { if (edges_angles[i] == unique_angles[j]) { unique_flag += 1; } } if (unique_flag 
== 0) { unique_angles[n_unique] = edges_angles[i]; n_unique += 1; unique_flag = 0; } else { unique_flag = 0; } } float minarea = 1e12; for (int i = 0; i < n_unique; i++) { float R[2][2]; float rot_points[2][MAXN]; R[0][0] = cos(unique_angles[i]); R[0][1] = sin(unique_angles[i]); R[1][0] = -sin(unique_angles[i]); R[1][1] = cos(unique_angles[i]); // R x Points for (int m = 0; m < 2; m++) { for (int n = 0; n < n_points; n++) { float sum = 0.0; for (int k = 0; k < 2; k++) { sum = sum + R[m][k] * convex_points[k][n]; } rot_points[m][n] = sum; } } // xmin; float xmin, ymin, xmax, ymax; xmin = 1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { continue; } else { if (rot_points[0][j] < xmin) { xmin = rot_points[0][j]; } } } // ymin ymin = 1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { continue; } else { if (rot_points[1][j] < ymin) { ymin = rot_points[1][j]; } } } // xmax xmax = -1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { continue; } else { if (rot_points[0][j] > xmax) { xmax = rot_points[0][j]; } } } // ymax ymax = -1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { continue; } else { if (rot_points[1][j] > ymax) { ymax = rot_points[1][j]; } } } float area = (xmax - xmin) * (ymax - ymin); if (area < minarea) { minarea = area; minbox[0] = unique_angles[i]; minbox[1] = xmin; minbox[2] = ymin; minbox[3] = xmax; minbox[4] = ymax; } } } // convex_find __device__ inline void Jarvis(Point *in_poly, int &n_poly) { int n_input = n_poly; Point input_poly[20]; for (int i = 0; i < n_input; i++) { input_poly[i].x = in_poly[i].x; input_poly[i].y = in_poly[i].y; } Point p_max, p_k; int max_index, k_index; int Stack[20], top1, top2; // float sign; double sign; Point right_point[10], left_point[10]; for (int i = 0; i < n_poly; i++) { if (in_poly[i].y < in_poly[0].y || in_poly[i].y 
== in_poly[0].y && in_poly[i].x < in_poly[0].x) { Point *j = &(in_poly[0]); Point *k = &(in_poly[i]); swap1(j, k); } if (i == 0) { p_max = in_poly[0]; max_index = 0; } if (in_poly[i].y > p_max.y || in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { p_max = in_poly[i]; max_index = i; } } if (max_index == 0) { max_index = 1; p_max = in_poly[max_index]; } k_index = 0, Stack[0] = 0, top1 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > dis(in_poly[Stack[top1]], p_k)))) { p_k = in_poly[i]; k_index = i; } } top1++; Stack[top1] = k_index; } for (int i = 0; i <= top1; i++) { right_point[i] = in_poly[Stack[i]]; } k_index = 0, Stack[0] = 0, top2 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > dis(in_poly[Stack[top2]], p_k))) { p_k = in_poly[i]; k_index = i; } } top2++; Stack[top2] = k_index; } for (int i = top2 - 1; i >= 0; i--) { left_point[i] = in_poly[Stack[i]]; } for (int i = 0; i < top1 + top2; i++) { if (i <= top1) { in_poly[i] = right_point[i]; } else { in_poly[i] = left_point[top2 - (i - top1)]; } } n_poly = top1 + top2; } template __device__ inline void Findminbox(T const *const p, T *minpoints) { Point ps1[MAXN]; Point convex[MAXN]; for (int i = 0; i < 9; i++) { convex[i].x = p[i * 2]; convex[i].y = p[i * 2 + 1]; } int n_convex = 9; Jarvis(convex, n_convex); int n1 = n_convex; for (int i = 0; i < n1; i++) { ps1[i].x = convex[i].x; ps1[i].y = convex[i].y; } ps1[n1].x = convex[0].x; ps1[n1].y = convex[0].y; float minbbox[5] = {0}; minBoundingRect(ps1, n1 + 1, minbbox); float angle = minbbox[0]; float xmin = minbbox[1]; float ymin = minbbox[2]; float xmax = minbbox[3]; float ymax = minbbox[4]; float 
R[2][2]; R[0][0] = cos(angle); R[0][1] = sin(angle); R[1][0] = -sin(angle); R[1][1] = cos(angle); minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; minpoints[1] = xmax * R[0][1] + ymin * R[1][1]; minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; } template __global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes, const T *ex_boxes, T *minbox) { CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { const T *cur_box = ex_boxes + index * 18; T *cur_min_box = minbox + index * 8; Findminbox(cur_box, cur_min_box); } } #endif // MIN_AREA_POLYGONS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/modulated_deform_conv_cuda_kernel.cuh ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer ***************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, *this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ********************* * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.cuh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. 
* \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng */ // modified from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu #ifndef MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH #define MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT template __device__ T dmcn_im2col_bilinear(const T *input, const int data_width, const int height, const int width, T h, T w) { int h_low = floorf(h); int w_low = floorf(w); int h_high = h_low + 1; int w_high = w_low + 1; T lh = h - h_low; T lw = w - w_low; T hh = 1 - lh, hw = 1 - lw; T v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; T v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; T v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; T v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floorf(argmax_h); int argmax_w_low = floorf(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; T weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == 
argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ T dmcn_get_coordinate_weight(T argmax_h, T argmax_w, const int height, const int width, const T *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floorf(argmax_h); int argmax_w_low = floorf(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; T weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void modulated_deformable_im2col_gpu_kernel( const int n, const T *data_im, const T 
*data_offset, const T *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, T *data_col) { CUDA_1D_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; T *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; const T *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const T *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; const T mask = data_mask_ptr[data_mask_hw_ptr]; T val = static_cast(0); const T h_im = h_in + i * dilation_h + offset_h; const T w_im = 
w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); *data_col_ptr = val * mask; data_col_ptr += batch_size * height_col * width_col; } } } } template __global__ void modulated_deformable_col2im_gpu_kernel( const int n, const T *data_col, const T *data_offset, const T *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, T *grad_im) { CUDA_1D_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const T *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const T *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = 
data_offset_ptr[data_offset_w_ptr]; const T mask = data_mask_ptr[data_mask_hw_ptr]; const T cur_inv_h_data = h_in + i * dilation_h + offset_h; const T cur_inv_w_data = w_in + j * dilation_w + offset_w; const T cur_top_grad = data_col[index] * mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; T weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void modulated_deformable_col2im_coord_gpu_kernel( const int n, const T *data_col, const T *data_im, const T *data_offset, const T *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, T *grad_offset, T *grad_mask) { CUDA_1D_KERNEL_LOOP(index, n) { T val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const T *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const T *data_im_ptr = data_im + (b * deformable_group 
+ deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const T *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const T *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; const T mask = data_mask_ptr[data_mask_hw_ptr]; T inv_h = h_in + i * dilation_h + offset_h; T inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) inv_h = inv_w = -2; else mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); const T weight = dmcn_get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c 
% 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * // height_col + h) * width_col + w], mask_req, mval); grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } #endif // MODULATED_DEFORM_CONV_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #ifndef DEFORM_ATTN_CUDA_KERNEL #define DEFORM_ATTN_CUDA_KERNEL #include "common_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp" template __device__ scalar_t ms_deform_attn_im2col_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { const int h_low = floorf(h); const int w_low = floorf(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * 
channels + c; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void ms_deform_attn_col2im_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t *&grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { const int h_low = floorf(h); const int w_low = floorf(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; 
grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void ms_deform_attn_col2im_bilinear_gm( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t *&grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { const int h_low = floorf(h); const int w_low = floorf(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int 
base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } template __global__ void ms_deformable_im2col_gpu_kernel( const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *data_col) { CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const 
int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; scalar_t *data_col_ptr = data_col + index; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t 
cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; const int qid_stride = num_heads * channels; CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, 
weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } *grad_sampling_loc_out = _grad_w; *(grad_sampling_loc_out + 1) = _grad_h; *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + 
grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc_out = cache_grad_sampling_loc[0]; *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; 
*grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { extern __shared__ int _s[]; scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = 
data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } *grad_sampling_loc_out = _grad_w; *(grad_sampling_loc_out + 1) = _grad_h; *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int 
num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { extern __shared__ int _s[]; scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; 
*(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc_out = cache_grad_sampling_loc[0]; *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { extern __shared__ int _s[]; scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); scalar_t 
*cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, 
w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { CUDA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = 
sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc_out, grad_attn_weight_out); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } #endif // DEFORM_ATTN_CUDA_KERNEL ================================================ FILE: mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef NMS_CUDA_KERNEL_CUH #define NMS_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT int const threadsPerBlock = sizeof(unsigned long long int) * 8; __device__ inline bool devIoU(float const *const a, float const *const b, const int offset, const float threshold) { float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); float width = fmaxf(right - left + offset, 0.f), height = fmaxf(bottom - top + offset, 0.f); float interS = width * height; float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); return interS > threshold * (Sa + Sb - interS); } __global__ static void nms_cuda(const int n_boxes, const float iou_threshold, const int offset, const float *dev_boxes, unsigned long long *dev_mask) { int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { const int tid = threadIdx.x; if (row_start > col_start) return; const int row_size = fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 4]; if (tid < col_size) { block_boxes[tid * 4 + 0] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; block_boxes[tid * 4 + 1] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; block_boxes[tid * 4 + 2] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; block_boxes[tid * 4 + 3] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; } __syncthreads(); if (tid < row_size) { const int cur_box_idx = threadsPerBlock * row_start + tid; const float *cur_box = dev_boxes + cur_box_idx * 4; 
int i = 0; unsigned long long int t = 0; int start = 0; if (row_start == col_start) { start = tid + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { t |= 1ULL << i; } } dev_mask[cur_box_idx * gridDim.y + col_start] = t; } } } __global__ static void gather_keep_from_mask(bool *keep, const unsigned long long *dev_mask, const int n_boxes) { const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; const int tid = threadIdx.x; // mark the bboxes which have been removed. extern __shared__ unsigned long long removed[]; // initialize removed. for (int i = tid; i < col_blocks; i += blockDim.x) { removed[i] = 0; } __syncthreads(); for (int nblock = 0; nblock < col_blocks; ++nblock) { auto removed_val = removed[nblock]; __syncthreads(); const int i_offset = nblock * threadsPerBlock; #pragma unroll for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { const int i = i_offset + inblock; if (i >= n_boxes) break; // select a candidate, check if it should kept. if (!(removed_val & (1ULL << inblock))) { if (tid == 0) { // mark the output. keep[i] = true; } auto p = dev_mask + i * col_blocks; // remove all bboxes which overlap the candidate. for (int j = tid; j < col_blocks; j += blockDim.x) { if (j >= nblock) removed[j] |= p[j]; } __syncthreads(); removed_val = removed[nblock]; } } } } #endif // NMS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/nms_quadri_cuda.cuh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved #ifndef NMS_QUADRI_CUDA_CUH #define NMS_QUADRI_CUDA_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #include "box_iou_rotated_utils.hpp" __host__ __device__ inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } namespace { int const threadsPerBlock = sizeof(unsigned long long) * 8; } template __global__ void nms_quadri_cuda_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes, unsigned long long* dev_mask, const int multi_label) { if (multi_label == 1) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 8 values // (x1, y1, ..., x4, y4) here. 
__shared__ T block_boxes[threadsPerBlock * 8]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 8 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0]; block_boxes[threadIdx.x * 8 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1]; block_boxes[threadIdx.x * 8 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2]; block_boxes[threadIdx.x * 8 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3]; block_boxes[threadIdx.x * 8 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4]; block_boxes[threadIdx.x * 8 + 5] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5]; block_boxes[threadIdx.x * 8 + 6] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6]; block_boxes[threadIdx.x * 8 + 7] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 9; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_quadri function from // box_iou_rotated_utils.h if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } else { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 8 values // (x1, y1, , ..., x4, y4) here. 
__shared__ T block_boxes[threadsPerBlock * 8]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 8 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0]; block_boxes[threadIdx.x * 8 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1]; block_boxes[threadIdx.x * 8 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2]; block_boxes[threadIdx.x * 8 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3]; block_boxes[threadIdx.x * 8 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4]; block_boxes[threadIdx.x * 8 + 5] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5]; block_boxes[threadIdx.x * 8 + 6] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6]; block_boxes[threadIdx.x * 8 + 7] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 8; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_quadri function from // box_iou_rotated_utils.h if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } } #endif ================================================ FILE: mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved // modified from // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu #ifndef NMS_ROTATED_CUDA_CUH #define NMS_ROTATED_CUDA_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #include "box_iou_rotated_utils.hpp" __host__ __device__ inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } namespace { int const threadsPerBlock = sizeof(unsigned long long) * 8; } template __global__ void nms_rotated_cuda_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes, unsigned long long* dev_mask, const int multi_label) { // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel if (multi_label == 1) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 5 values // (x_center, y_center, width, height, angle_degrees) here. 
__shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 6; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } else { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 5 values // (x_center, y_center, width, height, angle_degrees) here. 
__shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } } #endif ================================================ FILE: mmcv/ops/csrc/common/cuda/parrots_cudawarpfunction.cuh ================================================ /* * Copyright (c) 2019, SenseTime. 
*/ #ifndef INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ #define INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ #ifndef __CUDACC__ #error cudawarpfunction.cuh should only be included by .cu files #endif #include #include #ifdef PARROTS_USE_HALF #include #endif #ifdef __CUDA_ARCH__ #define CUDA_INTRINSIC_FUNC(Expr) Expr #else #define CUDA_INTRINSIC_FUNC(Expr) #endif #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 #ifdef PARROTS_USE_HALF #if CUDA_VERSION < 9000 __device__ inline float16 __shfl(float16 var, int srcLane, int width) { CUDA_INTRINSIC_FUNC(return __shfl(var.y, srcLane, width);); } __device__ inline float16 __shfl_up(float16 var, unsigned delta, int width) { CUDA_INTRINSIC_FUNC(return __shfl_up(var.y, delta, width);); } __device__ inline float16 __shfl_down(float16 var, unsigned delta, int width) { CUDA_INTRINSIC_FUNC(return __shfl_down(var.y, delta, width);); } __device__ inline float16 __shfl_xor(float16 var, int laneMask, int width) { CUDA_INTRINSIC_FUNC(return __shfl_xor(var.y, laneMask, width);); } #else // CUDA_VERSION >= 9000 __device__ inline float16 __shfl_sync(unsigned mask, float16 var, int srcLane, int width = warpSize) { CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_sync(mask, var.y, srcLane, width); return r;); } __device__ inline float16 __shfl_up_sync(unsigned mask, float16 var, unsigned delta, int width = warpSize) { CUDA_INTRINSIC_FUNC( float16 r; r.y = __shfl_up_sync(mask, var.y, delta, width); return r;); } __device__ inline float16 __shfl_down_sync(unsigned mask, float16 var, unsigned delta, int width = warpSize) { CUDA_INTRINSIC_FUNC( float16 r; r.y = __shfl_down_sync(mask, var.y, delta, width); return r;); } __device__ inline float16 __shfl_xor_sync(unsigned mask, float16 var, int laneMask, int width) { CUDA_INTRINSIC_FUNC(float16 r; r.y = __shfl_xor_sync(mask, var.y, laneMask, width); return r;); } #endif // CUDA_VERSION < 9000 #endif // PARROTS_USE_HALF // warp shuffle interface with a dummy mask #if CUDA_VERSION < 9000 template 
__device__ inline T __shfl_sync(unsigned mask, T var, int srcLane, int width = warpSize) { CUDA_INTRINSIC_FUNC(return __shfl(var, srcLane, width);); } template __device__ inline T __shfl_up_sync(unsigned mask, T var, unsigned delta, int width = warpSize) { CUDA_INTRINSIC_FUNC(return __shfl_up(var, delta, width);); } template __device__ inline T __shfl_down_sync(unsigned mask, T var, unsigned delta, int width = warpSize) { CUDA_INTRINSIC_FUNC(return __shfl_down(var, delta, width);); } template __device__ inline T __shfl_xor_sync(unsigned mask, T var, int laneMask, int width = warpSize) { CUDA_INTRINSIC_FUNC(return __shfl_xor(var, laneMask, width);); } #endif // CUDA_VERSION < 9000 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 #endif // INCLUDE_PARROTS_DARRAY_CUDAWARPFUNCTION_CUH_ ================================================ FILE: mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef POINT_IN_BOXES_CUDA_KERNEL_CUH #define POINT_IN_BOXES_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, T &local_x, T &local_y) { T cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } template __device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, T &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, // cz in the bottom center T x = pt[0], y = pt[1], z = pt[2]; T cx = box3d[0], cy = box3d[1], cz = box3d[2]; T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, 
local_y); float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } template __global__ void points_in_boxes_part_forward_cuda_kernel( int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // coordinate, z is the bottom center, each box DO NOT overlaps params pts: // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: // (B, npoints), default -1 int bs_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (bs_idx >= batch_size) return; boxes += bs_idx * boxes_num * 7; pts += bs_idx * pts_num * 3 + pt_idx * 3; box_idx_of_points += bs_idx * pts_num + pt_idx; T local_x = 0, local_y = 0; int cur_in_flag = 0; for (int k = 0; k < boxes_num; k++) { cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); if (cur_in_flag) { box_idx_of_points[0] = k; break; } } } } template __global__ void points_in_boxes_all_forward_cuda_kernel( int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // coordinate, z is the bottom center, each box DO NOT overlaps params pts: // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: // (B, npoints), default -1 int bs_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (bs_idx >= batch_size) return; boxes += bs_idx * boxes_num * 7; pts += bs_idx * pts_num * 3 + pt_idx * 3; box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; T local_x = 0, local_y = 0; for (int k = 0; k < boxes_num; k++) { const int cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); if (cur_in_flag) { box_idx_of_points[k] = 1; } } } } #endif // POINT_IN_BOXES_CUDA_KERNEL_CUH ================================================ FILE: 
mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH #define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif struct point { float x, y; }; template __global__ void points_in_polygons_forward_cuda_kernel( const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2, const int rows, const int cols, scalar_t *inside_flag) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int row = index / cols; int col = index % cols; const scalar_t *offset_vertex1 = vertex1 + row * 2; const scalar_t *offset_vertex2 = vertex2 + col * 8; point point_[1]; point polygon[4]; point_[0].x = offset_vertex1[0]; point_[0].y = offset_vertex1[1]; polygon[0].x = offset_vertex2[0]; polygon[0].y = offset_vertex2[1]; polygon[1].x = offset_vertex2[2]; polygon[1].y = offset_vertex2[3]; polygon[2].x = offset_vertex2[4]; polygon[2].y = offset_vertex2[5]; polygon[3].x = offset_vertex2[6]; polygon[3].y = offset_vertex2[7]; int nCross = 0; int i, j; float sx, sy, tx, ty, px, py, x; for (i = 0, j = 3; i < 4; j = i, i++) { sx = polygon[i].x; sy = polygon[i].y; tx = polygon[j].x; ty = polygon[j].y; px = point_[0].x; py = point_[0].y; if (py < min(sy, ty)) continue; if (py > max(sy, ty)) continue; if ((sx == px && sy == py) || (tx == px && ty == py)) { break; } else { if ((sy < py && ty >= py) || (sy >= py && ty < py)) { x = sx + (py - sy) * (tx - sx) / (ty - sy); if (x == px) { break; } if (x > px) { nCross++; } } } } if (nCross % 2 == 1) { inside_flag[index] = 1.0; } else { inside_flag[index] = 0.0; } return; } } #endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved // Modified from // https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu // Distributed under terms of the MIT license. #ifndef PRROI_POOL_CUDA_KERNEL_CUH #define PRROI_POOL_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __device__ static __forceinline__ T PrRoIPoolingGetData(const T *data, const int h, const int w, const int height, const int width) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); T retVal = overflow ? 0.0f : data[h * width + w]; return retVal; } template __device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) { return (1.0f - abs(dh)) * (1.0f - abs(dw)); } template __device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t, T c1, T c2) { return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1; } template __device__ static T PrRoIPoolingInterpolation(const T *data, const T h, const T w, const int height, const int width) { T retVal = 0.0f; int h1 = floorf(h); int w1 = floorf(w); retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); h1 = floorf(h) + 1; w1 = floorf(w); retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); h1 = floorf(h); w1 = floorf(w) + 1; retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); h1 = floorf(h) + 1; w1 = floorf(w) + 1; retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); return retVal; } template __device__ static T PrRoIPoolingMatCalculation(const T *this_data, const int s_h, const int s_w, const int e_h, const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, const int w0) { T alpha, beta, lim_alpha, lim_beta, tmp; T sum_out = 0; alpha = x0 - T(s_w); beta = y0 - T(s_h); lim_alpha = x1 - T(s_w); lim_beta = y1 
- T(s_h); tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; alpha = T(e_w) - x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; alpha = x0 - T(s_w); beta = T(e_h) - y1; lim_alpha = x1 - T(s_w); lim_beta = T(e_h) - y0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; alpha = T(e_w) - x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; return sum_out; } template __device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff, const int h, const int w, const int height, const int width, const T coeff) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff); } template __device__ static void PrRoIPoolingMatDistributeDiff( T *diff, const T top_diff, const int s_h, const int s_w, const int e_h, const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, const int w0) { T alpha, beta, lim_alpha, lim_beta, tmp; alpha = x0 - T(s_w); beta = y0 - T(s_h); lim_alpha = x1 - T(s_w); lim_beta = y1 - T(s_h); tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp); alpha = T(e_w) - x1; lim_alpha = T(e_w) 
- x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); alpha = x0 - T(s_w); beta = T(e_h) - y1; lim_alpha = x1 - T(s_w); lim_beta = T(e_h) - y0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); alpha = T(e_w) - x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); } template __global__ void prroi_pool_forward_cuda_kernel( const int nthreads, const T *input, const T *rois, T *output, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T *offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; T roi_x1 = offset_rois[1] * spatial_scale; T roi_y1 = offset_rois[2] * spatial_scale; T roi_x2 = offset_rois[3] * spatial_scale; T roi_y2 = offset_rois[4] * spatial_scale; T roi_width = max(roi_x2 - roi_x1, ((T)0.0)); T roi_height = max(roi_y2 - roi_y1, ((T)0.0)); T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); const T *this_data = input + (roi_batch_ind * channels + c) * height * width; T *this_out = output + index; T bin_x1 = roi_x1 + bin_size_w * pw; T bin_y1 = roi_y1 + bin_size_h * ph; T bin_x2 = bin_x1 + bin_size_w; T 
bin_y2 = bin_y1 + bin_size_h; T bin_size = max(T(0.0), bin_size_w * bin_size_h); if (bin_size == 0) { *this_out = 0; continue; } T sum_out = 0; int start_x, start_y, end_x, end_y; start_x = floorf(bin_x1); end_x = ceilf(bin_x2); start_y = floorf(bin_y1); end_y = ceilf(bin_y2); for (int bin_x = start_x; bin_x < end_x; ++bin_x) for (int bin_y = start_y; bin_y < end_y; ++bin_y) sum_out += PrRoIPoolingMatCalculation( this_data, bin_y, bin_x, bin_y + 1, bin_x + 1, max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, width); *this_out = sum_out / bin_size; } } template __global__ void prroi_pool_backward_cuda_kernel( const int nthreads, const T *grad_output, const T *rois, T *grad_input, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; auto rois_cur = rois + n * 5; int roi_batch_ind = rois_cur[0]; T roi_x1 = rois_cur[1] * spatial_scale; T roi_y1 = rois_cur[2] * spatial_scale; T roi_x2 = rois_cur[3] * spatial_scale; T roi_y2 = rois_cur[4] * spatial_scale; T roi_width = max(roi_x2 - roi_x1, (T)0); T roi_height = max(roi_y2 - roi_y1, (T)0); T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); const T *this_out_grad = grad_output + index; T *this_data_grad = grad_input + (roi_batch_ind * channels + c) * height * width; T bin_x1 = roi_x1 + bin_size_w * pw; T bin_y1 = roi_y1 + bin_size_h * ph; T bin_x2 = bin_x1 + bin_size_w; T bin_y2 = bin_y1 + bin_size_h; T bin_size = max(T(0.0), bin_size_w * bin_size_h); T sum_out = bin_size == T(0) ? 
T(0) : *this_out_grad / bin_size; int start_x, start_y, end_x, end_y; start_x = floorf(bin_x1); end_x = ceilf(bin_x2); start_y = floorf(bin_y1); end_y = ceilf(bin_y2); for (int bin_x = start_x; bin_x < end_x; ++bin_x) for (int bin_y = start_y; bin_y < end_y; ++bin_y) PrRoIPoolingMatDistributeDiff( this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1, max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, width); } } template __global__ void prroi_pool_coor_backward_cuda_kernel( const int nthreads, const T *output, const T *grad_output, const T *input, const T *rois, T *grad_rois, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; auto rois_cur = rois + n * 5; int roi_batch_ind = rois_cur[0]; T roi_x1 = rois_cur[1] * spatial_scale; T roi_y1 = rois_cur[2] * spatial_scale; T roi_x2 = rois_cur[3] * spatial_scale; T roi_y2 = rois_cur[4] * spatial_scale; T roi_width = max(roi_x2 - roi_x1, (T)0); T roi_height = max(roi_y2 - roi_y1, (T)0); T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); const T output_grad_val = grad_output[index]; const T *this_input_data = input + (roi_batch_ind * channels + c) * height * width; const T output_val = output[index]; T *this_rois_grad = grad_rois + n * 5; T bin_x1 = roi_x1 + bin_size_w * pw; T bin_y1 = roi_y1 + bin_size_h * ph; T bin_x2 = bin_x1 + bin_size_w; T bin_y2 = bin_y1 + bin_size_h; T bin_size = max(T(0.0), bin_size_w * bin_size_h); T sum_out = bin_size == T(0) ? 
T(0) : output_grad_val / bin_size; // WARNING: to be discussed if (sum_out == 0) continue; int start_x, start_y, end_x, end_y; start_x = floorf(bin_x1); end_x = ceilf(bin_x2); start_y = floorf(bin_y1); end_y = ceilf(bin_y2); T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0; for (int bin_y = start_y; bin_y < end_y; ++bin_y) { grad_x1_y += PrRoIPoolingSingleCoorIntegral( max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1, height, width), PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1, height, width)); grad_x2_y += PrRoIPoolingSingleCoorIntegral( max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2, height, width), PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2, height, width)); } for (int bin_x = start_x; bin_x < end_x; ++bin_x) { grad_x_y1 += PrRoIPoolingSingleCoorIntegral( max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x), height, width), PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1), height, width)); grad_x_y2 += PrRoIPoolingSingleCoorIntegral( max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x), height, width), PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1), height, width)); } T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val; T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val; T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val; T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val; partial_x1 = partial_x1 / bin_size * spatial_scale; partial_x2 = partial_x2 / bin_size * spatial_scale; partial_y1 = partial_y1 / bin_size * spatial_scale; partial_y2 = partial_y2 / bin_size * spatial_scale; // (index, x1, y1, x2, y2) this_rois_grad[0] = 
0; atomicAdd(this_rois_grad + 1, (partial_x1 * (1.0f - T(pw) / pooled_width) + partial_x2 * (1.0f - T(pw + 1) / pooled_width)) * output_grad_val); atomicAdd(this_rois_grad + 2, (partial_y1 * (1.0f - T(ph) / pooled_height) + partial_y2 * (1.0f - T(ph + 1) / pooled_height)) * output_grad_val); atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width + partial_x1 * T(pw) / pooled_width) * output_grad_val); atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height + partial_y1 * T(ph) / pooled_height) * output_grad_val); } } #endif // ROI_POOL_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/psamask_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef PSAMASK_CUDA_KERNEL_CUH #define PSAMASK_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif // CUDA: grid stride looping #ifndef CUDA_KERNEL_LOOP #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) #endif template __global__ void psamask_collect_forward_cuda( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* mask_data, T* buffer_data) { CUDA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = 
wstart; widx < wend; widx++) { buffer_data[(n * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)) * h_feature * w_feature + h * w_feature + w] = mask_data [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w]; } } } } template __global__ void psamask_distribute_forward_cuda( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* mask_data, T* buffer_data) { CUDA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { buffer_data[(n * h_feature * w_feature + h * w_feature + w) * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)] = mask_data [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w]; } } } } template __global__ void psamask_collect_backward_cuda( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* buffer_diff, T* mask_diff) { CUDA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - 
h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w] = buffer_diff[(n * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)) * h_feature * w_feature + h * w_feature + w]; } } } } template __global__ void psamask_distribute_backward_cuda( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* buffer_diff, T* mask_diff) { CUDA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w] = buffer_diff[(n * h_feature * w_feature + h * w_feature + w) * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)]; } } } } #endif // PSAMASK_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh ================================================ // Modified from // 
https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu #ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH #define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH #include #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS /*** Forward ***/ template __global__ void riroi_align_rotated_forward_cuda_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int num_samples, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int num_orientations, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int o = (index / pooled_width / pooled_height) % num_orientations; int c = (index / pooled_width / pooled_height / num_orientations) % channels; int n = index / pooled_width / pooled_height / num_orientations / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // find aligned index scalar_t ind_float = theta * num_orientations / (2 * M_PI); int ind = 
floorf(ind_float); scalar_t l_var = ind_float - (scalar_t)ind; scalar_t r_var = 1.0 - l_var; // correct start channel ind = (ind + num_orientations) % num_orientations; // rotated channel int ind_rot = (o - ind + num_orientations) % num_orientations; int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot) * height * width; const scalar_t *offset_bottom_data_plus = bottom_data + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot_plus) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (num_samples > 0) ? num_samples : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. } scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosscalar_theta = cos(theta); scalar_t sinscalar_theta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 scalar_t output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta (counterclockwise) around the center and translate scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; scalar_t val = bilinear_interpolate( offset_bottom_data, height, width, y, x, index); scalar_t val_plus = bilinear_interpolate( offset_bottom_data_plus, height, width, y, x, index); output_val += r_var * val + l_var * val_plus; } } output_val /= count; top_data[index] = output_val; } } /*** Backward ***/ template __global__ void riroi_align_rotated_backward_cuda_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int num_samples, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int num_orientations, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int o = (index / pooled_width / pooled_height) % num_orientations; int c = (index / pooled_width / pooled_height / num_orientations) % channels; int n = index / pooled_width / pooled_height / num_orientations / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not round scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t 
roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // find aligned index scalar_t ind_float = theta * num_orientations / (2 * M_PI); int ind = floorf(ind_float); scalar_t l_var = ind_float - (scalar_t)ind; scalar_t r_var = 1.0 - l_var; // correct start channel ind = (ind + num_orientations) % num_orientations; // rotated channel int ind_rot = (o - ind + num_orientations) % num_orientations; int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; scalar_t *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot) * height * width; scalar_t *offset_bottom_diff_plus = bottom_diff + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot_plus) * height * width; int top_offset = (n * channels * num_orientations + c * num_orientations + o) * pooled_height * pooled_width; const scalar_t *offset_top_diff = top_diff + top_offset; const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (num_samples > 0) ? num_samples : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. 
} scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosTheta = cos(theta); scalar_t sinTheta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); scalar_t g1 = top_diff_this_bin * w1 / count; scalar_t g2 = top_diff_this_bin * w2 / count; scalar_t g3 = top_diff_this_bin * w3 / count; scalar_t g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, g1 * l_var); atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, g2 * l_var); atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, g3 * l_var); atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, g4 * l_var); } // if } // ix } // iy } // CUDA_1D_KERNEL_LOOP } // RiRoIAlignBackward #endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/roi_align_cuda_kernel.cuh 
================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef ROI_ALIGN_CUDA_KERNEL_CUH #define ROI_ALIGN_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT /*** Forward ***/ template __global__ void roi_align_forward_cuda_kernel( const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, T* argmax_x, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, const int pool_mode, // 0 - max pool, 1 - avg pool const bool aligned, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; // Do not using rounding; this implementation detail is critical T offset = aligned ? 
(T)0.5 : (T)0.0; T roi_start_w = offset_rois[1] * spatial_scale - offset; T roi_start_h = offset_rois[2] * spatial_scale - offset; T roi_end_w = offset_rois[3] * spatial_scale - offset; T roi_end_h = offset_rois[4] * spatial_scale - offset; T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); if (pool_mode == 0) { // We do max pooling inside a bin T maxval = -FLT_MAX; T maxidx_y = -1.f, maxidx_x = -1.f; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_input, height, width, y, x, index); if (val > maxval) { maxval = val; maxidx_y = y; maxidx_x = x; } } } output[index] = maxval; argmax_y[index] = maxidx_y; argmax_x[index] = maxidx_x; } else if (pool_mode == 1) { // We do average pooling inside a bin const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * 
bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_input, height, width, y, x, index); output_val += val; } } output[index] = output_val / count; } } } /*** Backward ***/ template __global__ void roi_align_backward_cuda_kernel( const int nthreads, const T* grad_output, const T* rois, const T* argmax_y, const T* argmax_x, T* grad_input, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, const int pool_mode, // 0 - max pool, 1 - avg pool const bool aligned, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T grad_output_this_bin = grad_output[index]; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; T* offset_grad_input = grad_input + ((roi_batch_ind * channels + c) * height * width); if (pool_mode == 0) { T y = argmax_y[index], x = argmax_x[index]; if (y != -1.f) { T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_grad_input + y_low * width + x_low, grad_output_this_bin * w1); atomicAdd(offset_grad_input + y_low * width + x_high, grad_output_this_bin * w2); atomicAdd(offset_grad_input + y_high * width + x_low, grad_output_this_bin * w3); atomicAdd(offset_grad_input + y_high * width + x_high, grad_output_this_bin * w4); } } } else if (pool_mode == 1) { // Do not using rounding; this implementation detail is critical T offset = aligned ? 
(T)0.5 : (T)0.0; T roi_start_w = offset_rois[1] * spatial_scale - offset; T roi_start_h = offset_rois[2] * spatial_scale - offset; T roi_end_w = offset_rois[3] * spatial_scale - offset; T roi_end_h = offset_rois[4] * spatial_scale - offset; T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_grad_input + y_low * width + x_low, grad_output_this_bin * w1 / count); atomicAdd(offset_grad_input + y_low * width + x_high, grad_output_this_bin * w2 / count); atomicAdd(offset_grad_input + y_high * width + x_low, grad_output_this_bin * w3 / count); atomicAdd(offset_grad_input + y_high * width + x_high, grad_output_this_bin * w4 / count); } } } } } } #endif // ROI_ALIGN_CUDA_KERNEL_CUH ================================================ FILE: 
mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh ================================================ // Modified from // https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved #ifndef ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH #define ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH #include #ifdef MMCV_WITH_TRT #include "common_cuda_helper.hpp" #else // MMCV_WITH_TRT #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else // MMCV_USE_PARROTS #include "pytorch_cuda_helper.hpp" #endif // MMCV_USE_PARROTS #endif // MMCV_WITH_TRT /*** Forward ***/ template __global__ void roi_align_rotated_forward_cuda_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. 
} if (!aligned) { // for backward-compatibility only // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); } scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosscalar_theta = cos(theta); scalar_t sinscalar_theta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 scalar_t output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta (counterclockwise) around the center and translate scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; scalar_t val = bilinear_interpolate( offset_bottom_data, height, width, y, x, index); output_val += val; } } output_val /= count; top_data[index] = output_val; } } /*** Backward ***/ template __global__ void roi_align_rotated_backward_cuda_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not round scalar_t offset = aligned ? 
(scalar_t)0.5 : (scalar_t)0.0; scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. } if (!aligned) { // for backward-compatibility only // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); } scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); scalar_t *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const scalar_t *offset_top_diff = top_diff + top_offset; const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosTheta = cos(theta); scalar_t sinTheta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); scalar_t g1 = top_diff_this_bin * w1 / count; scalar_t g2 = top_diff_this_bin * w2 / count; scalar_t g3 = top_diff_this_bin * w3 / count; scalar_t g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); } // if } // ix } // iy } // CUDA_1D_KERNEL_LOOP } // RoIAlignBackward #endif // ROI_ALIGN_ROTATED_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/roi_pool_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef ROI_POOL_CUDA_KERNEL_CUH #define ROI_POOL_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void roi_pool_forward_cuda_kernel( const int nthreads, const T* input, const T* rois, T* output, int* argmax, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; // calculate the roi region on feature maps T roi_x1 = offset_rois[1] * spatial_scale; T roi_y1 = offset_rois[2] * spatial_scale; T roi_x2 = (offset_rois[3] + 1) * spatial_scale; T roi_y2 = (offset_rois[4] + 1) * spatial_scale; // force malformed rois to be 1x1 T roi_w = roi_x2 - roi_x1; T roi_h = roi_y2 - roi_y1; if (roi_w <= 0 || roi_h <= 0) continue; T bin_size_w = roi_w / static_cast(pooled_width); T bin_size_h = roi_h / static_cast(pooled_height); // the corresponding bin region int bin_x1 = floorf(static_cast(pw) * bin_size_w + roi_x1); int bin_y1 = floorf(static_cast(ph) * bin_size_h + roi_y1); int bin_x2 = ceilf(static_cast(pw + 1) * bin_size_w + roi_x1); int bin_y2 = ceilf(static_cast(ph + 1) * bin_size_h + roi_y1); // add roi offsets and clip to input boundaries bin_x1 = min(max(bin_x1, 0), width); bin_y1 = min(max(bin_y1, 0), height); bin_x2 = min(max(bin_x2, 0), width); bin_y2 = min(max(bin_y2, 0), height); bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; // Define an empty pooling region to be zero // If nothing is pooled, argmax = -1 causes nothing to be backprop'd 
T max_val = is_empty ? 0 : -FLT_MAX; int max_idx = -1; for (int h = bin_y1; h < bin_y2; ++h) { for (int w = bin_x1; w < bin_x2; ++w) { int offset = h * width + w; if (offset_input[offset] > max_val) { max_val = offset_input[offset]; max_idx = offset; } } } output[index] = max_val; if (argmax != NULL) argmax[index] = max_idx; } } template __global__ void roi_pool_backward_cuda_kernel( const int nthreads, const T* grad_output, const T* rois, const int* argmax, T* grad_input, const int pooled_height, const int pooled_width, const int channels, const int height, const int width) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c) is an element in the pooled output int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; int roi_batch_ind = rois[n * 5]; T* grad_input_offset = grad_input + ((roi_batch_ind * channels + c) * height * width); int argmax_index = argmax[index]; if (argmax_index != -1) { atomicAdd(grad_input_offset + argmax_index, grad_output[index]); } } } #endif // ROI_POOL_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef ROIAWARE_POOL3D_CUDA_KERNEL_CUH #define ROIAWARE_POOL3D_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, T &local_x, T &local_y) { T cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } template __device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, T &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, // cz in the bottom center T x = pt[0], y = pt[1], z = pt[2]; T cx = box3d[0], cy = box3d[1], cz = box3d[2]; T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } template __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z, const T *rois, const T *pts, int *pts_mask) { // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, // y_idxs, z_idxs) by binary bit int box_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (box_idx >= boxes_num) return; pts += pt_idx * 3; rois += box_idx * 7; pts_mask += box_idx * pts_num + pt_idx; T local_x = 0, local_y = 0; int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); pts_mask[0] = -1; if (cur_in_flag > 0) { T local_z = pts[2] - rois[2]; T x_size = rois[3], y_size = rois[4], z_size = rois[5]; T x_res = x_size / out_x; T 
y_res = y_size / out_y; T z_res = z_size / out_z; unsigned int x_idx = int((local_x + x_size / 2) / x_res); unsigned int y_idx = int((local_y + y_size / 2) / y_res); unsigned int z_idx = int(local_z / z_res); x_idx = min(max(x_idx, 0), out_x - 1); y_idx = min(max(y_idx, 0), out_y - 1); z_idx = min(max(z_idx, 0), out_z - 1); unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; pts_mask[0] = idx_encoding; } } } template __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, int max_pts_each_voxel, int out_x, int out_y, int out_z, const int *pts_mask, T *pts_idx_of_voxels) { // params pts_mask: (N, npoints) 0 or 1 // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) { int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; for (int k = 0; k < pts_num; k++) { if (pts_mask[box_idx * pts_num + k] != -1) { unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; unsigned int x_idx = (idx_encoding >> 16) & 0xFF; unsigned int y_idx = (idx_encoding >> 8) & 0xFF; unsigned int z_idx = idx_encoding & 0xFF; unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + y_idx * out_z * max_pts_each_voxel + z_idx * max_pts_each_voxel; unsigned int cnt = pts_idx_of_voxels[base_offset]; if (cnt < max_num_pts) { pts_idx_of_voxels[base_offset + cnt + 1] = k; pts_idx_of_voxels[base_offset]++; } } } } } template __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z, const T *pts_feature, const int *pts_idx_of_voxels, T *pooled_features, int *argmax) { // params pts_feature: (npoints, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) // params argmax: (N, out_x, out_y, out_z, C) int box_idx = blockIdx.z; int 
channel_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; int argmax_idx = -1; float max_val = -1e50; int total_pts = pts_idx_of_voxels[0]; for (int k = 1; k <= total_pts; k++) { if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; argmax_idx = pts_idx_of_voxels[k]; } } if (argmax_idx != -1) { pooled_features[0] = max_val; } argmax[0] = argmax_idx; } } template __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z, const T *pts_feature, const int *pts_idx_of_voxels, T *pooled_features) { // params pts_feature: (npoints, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) // params argmax: (N, out_x, out_y, out_z, C) int box_idx = blockIdx.z; int channel_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * 
max_pts_each_voxel; pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; float sum_val = 0; int total_pts = pts_idx_of_voxels[0]; for (int k = 1; k <= total_pts; k++) { sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; } if (total_pts > 0) { pooled_features[0] = sum_val / total_pts; } } } template __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z, const int *argmax, const T *grad_out, T *grad_in) { // params argmax: (N, out_x, out_y, out_z, C) // params grad_out: (N, out_x, out_y, out_z, C) // params grad_in: (npoints, C), return value int box_idx = blockIdx.z; int channel_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; if (argmax[0] == -1) return; atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); } } template __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z, int max_pts_each_voxel, const int *pts_idx_of_voxels, const T *grad_out, T *grad_in) { // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params grad_out: (N, out_x, out_y, out_z, C) // params grad_in: (npoints, C), return value int box_idx = blockIdx.z; int channel_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= 
boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; int total_pts = pts_idx_of_voxels[0]; float cur_grad = 1 / fmaxf(float(total_pts), 1.0); for (int k = 1; k <= total_pts; k++) { atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, grad_out[0] * cur_grad); } } } #endif // ROIAWARE_POOL3D_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef ROIPOINT_POOL3D_CUDA_KERNEL_CUH #define ROIPOINT_POOL3D_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, T &local_x, T &local_y) { T cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } template __device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, T &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the // bottom center T x = pt[0], y = pt[1], z = pt[2]; T cx = box3d[0], cy = box3d[1], cz = box3d[2]; T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6]; cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > dz / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) & (local_y > -dy / 2.0) & (local_y < dy / 2.0); return in_flag; } template __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const T *xyz, const T 
*boxes3d, int *pts_assign) { // params xyz: (B, N, 3) // params boxes3d: (B, M, 7) // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means // background points int box_idx = blockIdx.y; int bs_idx = blockIdx.z; CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (box_idx >= boxes_num || bs_idx >= batch_size) return; int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; pts_assign[assign_idx] = 0; int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; T local_x = 0, local_y = 0; int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y); pts_assign[assign_idx] = cur_in_flag; } } __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num, const int *pts_assign, int *pts_idx, int *pooled_empty_flag) { // params xyz: (B, N, 3) // params pts_feature: (B, N, C) // params pts_assign: (B, N) // params pts_idx: (B, M, 512) // params pooled_empty_flag: (B, M) CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) { int bs_idx = blockIdx.y; int cnt = 0; for (int k = 0; k < pts_num; k++) { if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { if (cnt < sampled_pts_num) { pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k; cnt++; } else break; } } if (cnt == 0) { pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; } else if (cnt < sampled_pts_num) { // duplicate same points for sampling for (int k = cnt; k < sampled_pts_num; k++) { int duplicate_idx = k % cnt; int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; } } } } template __global__ void roipoint_pool3d_forward( int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature, T *pooled_features, int *pooled_empty_flag) { // params xyz: (B, N, 3) // 
params pts_idx: (B, M, 512) // params pts_feature: (B, N, C) // params pooled_features: (B, M, 512, 3+C) // params pooled_empty_flag: (B, M) int box_idx = blockIdx.y; int bs_idx = blockIdx.z; CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) { if (box_idx >= boxes_num || bs_idx >= batch_size) return; if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return; int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx; int src_pt_idx = pts_idx[temp_idx]; int dst_feature_offset = temp_idx * (3 + feature_in_len); for (int j = 0; j < 3; j++) pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; memcpy(pooled_features + dst_feature_offset + 3, pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } } #endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved. 
// Modified from // https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu #ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH #define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void rotated_feature_align_forward_kernel( const int nthreads, const int points, const scalar_t* bottom_data, const scalar_t* best_bboxes, const scalar_t spatial_scale, const int channels, const int height, const int width, scalar_t* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int w = index % width; int h = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; const scalar_t* bbox_offset = best_bboxes + ((n * height + h) * width + w) * 5; scalar_t roi_y = bbox_offset[0] * spatial_scale; scalar_t roi_x = bbox_offset[1] * spatial_scale; scalar_t px[5] = {roi_x, 0, 0, 0, 0}; scalar_t py[5] = {roi_y, 0, 0, 0, 0}; if (points > 1) { scalar_t roi_w = bbox_offset[2] * spatial_scale; scalar_t roi_h = bbox_offset[3] * spatial_scale; scalar_t roi_a = bbox_offset[4]; scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); scalar_t wx = cosa * w_2, wy = sina * w_2; scalar_t hx = -sina * h_2, hy = cosa * h_2; px[1] = roi_x + wx + hx; py[1] = roi_y + wy + hy; px[2] = roi_x - wx + hx; py[2] = roi_y - wy + hy; px[3] = roi_x - wx - hx; py[3] = roi_y - wy - hy; px[4] = roi_x + wx - hx; py[4] = roi_y + wy - hy; } const scalar_t* offset_bottom_data = bottom_data + (n * channels + c) * height * width; scalar_t output_val = bottom_data[index]; for (int i = 0; i < points; i++) { output_val += bilinear_interpolate(offset_bottom_data, height, width, py[i], px[i], i); } top_data[index] = output_val; } } template __global__ void rotated_feature_align_backward_kernel( const int nthreads, const int points, const scalar_t* top_diff, const scalar_t* 
best_bboxes, const scalar_t spatial_scale, const int channels, const int height, const int width, scalar_t* bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int w = index % width; int h = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; const scalar_t* bbox_offset = best_bboxes + ((n * height + h) * width + w) * 5; scalar_t roi_y = bbox_offset[0] * spatial_scale; scalar_t roi_x = bbox_offset[1] * spatial_scale; scalar_t px[5] = {roi_x, 0, 0, 0, 0}; scalar_t py[5] = {roi_y, 0, 0, 0, 0}; if (points > 1) { scalar_t roi_w = bbox_offset[2] * spatial_scale; scalar_t roi_h = bbox_offset[3] * spatial_scale; scalar_t roi_a = bbox_offset[4]; scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); scalar_t wx = cosa * w_2, wy = sina * w_2; scalar_t hx = -sina * h_2, hy = cosa * h_2; px[1] = roi_x + wx + hx; py[1] = roi_y + wy + hy; px[2] = roi_x - wx + hx; py[2] = roi_y - wy + hy; px[3] = roi_x - wx - hx; py[3] = roi_y - wy - hy; px[4] = roi_x + wx - hx; py[4] = roi_y + wy - hy; } scalar_t* offset_bottom_diff = bottom_diff + (n * channels + c) * height * width; scalar_t value_top_diff = top_diff[index]; atomicAdd(bottom_diff + index, value_top_diff); for (int i = 0; i < points; i++) { scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, py[i], px[i], w1, w2, w3, w4, x_low, x_high, y_low, y_high, i); scalar_t g1 = value_top_diff * w1; scalar_t g2 = value_top_diff * w2; scalar_t g3 = value_top_diff * w3; scalar_t g4 = value_top_diff * w4; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); } } } } #endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH 
================================================ FILE: mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef SCATTER_POINTS_CUDA_KERNEL_CUH #define SCATTER_POINTS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; int const maxGridDim = 50000; __device__ __forceinline__ static void reduceMax(float *address, float val) { int *address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { assumed = old; old = atomicCAS(address_as_i, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); } while (assumed != old || __int_as_float(old) < val); } __device__ __forceinline__ static void reduceMax(double *address, double val) { unsigned long long *address_as_ull = reinterpret_cast(address); unsigned long long old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS( address_as_ull, assumed, __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); } while (assumed != old || __longlong_as_double(old) < val); } // get rid of meaningless warnings when compiling host code #ifdef MMCV_WITH_HIP __device__ __forceinline__ static void reduceAdd(float *address, float val) { atomicAdd(address, val); } __device__ __forceinline__ static void reduceAdd(double *address, double val) { atomicAdd(address, val); } #else #ifdef __CUDA_ARCH__ __device__ __forceinline__ static void reduceAdd(float *address, float val) { #if (__CUDA_ARCH__ < 200) #ifdef _MSC_VER #pragma message( \ "compute capability lower than 2.x. fall back to use CAS version of atomicAdd for float32") #else #warning \ "compute capability lower than 2.x. 
fall back to use CAS version of atomicAdd for float32" #endif int *address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { assumed = old; old = atomicCAS(address_as_i, assumed, __float_as_int(val + __int_as_float(assumed))); } while (assumed != old); #else atomicAdd(address, val); #endif } __device__ __forceinline__ static void reduceAdd(double *address, double val) { #if (__CUDA_ARCH__ < 600) #ifdef _MSC_VER #pragma message( \ "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64") #else #warning \ "compute capability lower than 6.x. fall back to use CAS version of atomicAdd for float64" #endif unsigned long long *address_as_ull = reinterpret_cast(address); unsigned long long old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); #else atomicAdd(address, val); #endif } #endif // __CUDA_ARCH__ #endif // MMCV_WITH_HIP template __global__ void feats_reduce_kernel( const T *feats, const int32_t *coors_map, T *reduced_feats, // shall be 0 at initialization const int num_input, const int num_feats, const reduce_t reduce_type) { CUDA_1D_KERNEL_LOOP(x, num_input) { int32_t reduce_to = coors_map[x]; if (reduce_to == -1) continue; const T *feats_offset = feats + x * num_feats; T *reduced_feats_offset = reduced_feats + reduce_to * num_feats; if (reduce_type == reduce_t::MAX) { for (int i = 0; i < num_feats; i++) { reduceMax(&reduced_feats_offset[i], feats_offset[i]); } } else { for (int i = 0; i < num_feats; i++) { reduceAdd(&reduced_feats_offset[i], feats_offset[i]); } } } } template __global__ void add_reduce_traceback_grad_kernel( T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map, const int32_t *reduce_count, const int num_input, const int num_feats, const reduce_t reduce_type) { CUDA_1D_KERNEL_LOOP(x, num_input) { int32_t reduce_to = coors_map[x]; if (reduce_to == -1) { 
continue; } const int input_offset = x * num_feats; T *grad_feats_offset = grad_feats + input_offset; const int reduced_offset = reduce_to * num_feats; const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; if (reduce_type == reduce_t::SUM) { for (int i = 0; i < num_feats; i++) { grad_feats_offset[i] = grad_reduced_feats_offset[i]; } } else if (reduce_type == reduce_t::MEAN) { for (int i = 0; i < num_feats; i++) { grad_feats_offset[i] = grad_reduced_feats_offset[i] / static_cast(reduce_count[reduce_to]); } } } } template __global__ void max_reduce_traceback_scatter_idx_kernel( const T *feats, const T *reduced_feats, int32_t *reduce_from, const int32_t *coors_map, const int num_input, const int num_feats) { CUDA_1D_KERNEL_LOOP(x, num_input) { int32_t reduce_to = coors_map[x]; const int input_offset = x * num_feats; const T *feats_offset = feats + input_offset; if (reduce_to == -1) { continue; } const int reduced_offset = reduce_to * num_feats; const T *reduced_feats_offset = reduced_feats + reduced_offset; int32_t *reduce_from_offset = reduce_from + reduced_offset; for (int i = 0; i < num_feats; i++) { if (feats_offset[i] == reduced_feats_offset[i]) { atomicMin(&reduce_from_offset[i], static_cast(x)); } } } } template __global__ void max_reduce_scatter_grad_kernel(T *grad_feats, const T *grad_reduced_feats, const int32_t *reduce_from, const int num_reduced, const int num_feats) { CUDA_1D_KERNEL_LOOP(x, num_reduced) { const int reduced_offset = x * num_feats; const int32_t *scatter_to_offset = reduce_from + reduced_offset; const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; for (int i = 0; i < num_feats; i++) { grad_feats[scatter_to_offset[i] * num_feats + i] = grad_reduced_feats_offset[i]; } } } #endif // SCATTER_POINTS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/sigmoid_focal_loss_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH #define SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void sigmoid_focal_loss_forward_cuda_kernel( const int nthreads, const T* input, const int64_t* target, const T* weight, T* output, const T gamma, const T alpha, const int num_classes) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int n = index / num_classes; int c = index % num_classes; int64_t t = target[n]; T flag_p = (t == c); T flag_n = (t != c); // p = sigmoid(x) = 1. / 1. + expf(-x) T p = (T)1. / ((T)1. + expf(-input[index])); // (1 - p)**gamma * log(p) T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN)); // p**gamma * log(1 - p) T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN)); output[index] = (T)0.; output[index] += -flag_p * alpha * term_p; output[index] += -flag_n * ((T)1. - alpha) * term_n; if (weight != NULL) { output[index] *= weight[t]; } } } template __global__ void sigmoid_focal_loss_backward_cuda_kernel( const int nthreads, const T* input, const int64_t* target, const T* weight, T* grad_input, const T gamma, const T alpha, const int num_classes) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int n = index / num_classes; int c = index % num_classes; int64_t t = target[n]; T flag_p = (t == c); T flag_n = (t != c); // p = sigmoid(x) = 1. / 1. + expf(-x) T p = (T)1. / ((T)1. + exp(-input[index])); // (1 - p)**gamma * (1 - p - gamma*p*log(p)) T term_p = pow(((T)1. - p), gamma) * ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN)))); // p**gamma * (gamma * (1 - p) * log(1 - p) - p) T term_n = pow(p, gamma) * (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p); grad_input[index] = (T)0.; grad_input[index] += -flag_p * alpha * term_p; grad_input[index] += -flag_n * ((T)1. 
- alpha) * term_n; if (weight != NULL) { grad_input[index] *= weight[t]; } } } #endif // SIGMOID_FOCAL_LOSS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/softmax_focal_loss_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH #define SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void softmax_focal_loss_forward_cuda_kernel( const int nthreads, const T* softmax, const int64_t* target, const T* weight, T* output, const T gamma, const T alpha, const int num_classes) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int64_t label = target[index]; T pred = softmax[index * num_classes + label]; if (label >= 0) { output[index] = -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN)); } else { output[index] = 0; } if (weight != NULL) { output[index] *= weight[label]; } } } template __global__ void softmax_focal_loss_backward_cuda1_kernel( const int nthreads, const T* softmax, const int64_t* target, const T* weight, T* buff, const T gamma, const T alpha, const int num_classes) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int64_t label = target[index]; T pred = softmax[index * num_classes + label]; if (label >= 0) { buff[index] = alpha * (-pow((T)1. - pred, gamma) + gamma * pow((T)1. - pred, gamma - 1) * pred * log(max(pred, (T)FLT_MIN))); } else { buff[index] = 0; } if (weight != NULL) { buff[index] *= weight[label]; } } } template __global__ void softmax_focal_loss_backward_cuda2_kernel( const int nthreads, const T* softmax, const int64_t* target, const T* buff, T* grad_input, const int num_classes) { CUDA_1D_KERNEL_LOOP(index, nthreads) { int n = index / num_classes; int c = index % num_classes; int64_t label = target[n]; if (label >= 0) { T flag = (label == c ? (T)1. 
: (T)0.);
      grad_input[index] = buff[n] * (flag - softmax[index]);
    } else {
      grad_input[index] = 0;
    }
  }
}

#endif  // SOFTMAX_FOCAL_LOSS_CUDA_KERNEL_CUH

================================================
FILE: mmcv/ops/csrc/common/cuda/spconv/indice.cuh
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef INDICE_CU_H_
#define INDICE_CU_H_

// NOTE(review): in this copy of the file the operands of the #include
// directives, the template parameter lists and every other <...> argument
// list are missing. They must be restored from the upstream header before
// this compiles; the code below is otherwise left untouched.
#include
#include
#include

// For every input row ix, getValidOutPos fills validPoints with entries of
// NDim + 1 values each (NDim coordinates followed by an offset at position
// NDim). For each entry a slot is reserved by atomically incrementing
// indiceNum[offset]; the input row index is stored in
// indicePairs(offset, 0, slot) and a flattened output index (rowArrayIdx of
// the coordinates plus spatialVolume * indicesIn(ix, 0)) in both
// indicePairs(offset, 1, slot) and indicePairUnique.
// indicesOut and gridsOut are not used here; kernelVolume is computed but
// never read.
template __global__ void prepareIndicePairsKernel(
    tv::TensorView indicesIn, tv::TensorView indicesOut,
    tv::TensorView gridsOut, tv::TensorView indicePairs,
    tv::TensorView indiceNum, tv::TensorView indicePairUnique,
    const tv::SimpleVector kernelSize, const tv::SimpleVector stride,
    const tv::SimpleVector padding, const tv::SimpleVector dilation,
    const tv::SimpleVector outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX(numActIn)) {
    // "+ 1" skips column 0 of the row; the remaining NDim values are passed
    // as the point's coordinates.
    numValidPoints = getValidOutPos(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // Reserve the next free slot for this offset.
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

// Same as prepareIndicePairsKernel, except that the valid output positions
// come from getValidOutPosTranspose.
// indicesOut and gridsOut are not used here; kernelVolume is computed but
// never read.
template __global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView indicesIn, tv::TensorView indicesOut,
    tv::TensorView gridsOut, tv::TensorView indicePairs,
    tv::TensorView indiceNum, tv::TensorView indicePairUnique,
    const tv::SimpleVector kernelSize, const tv::SimpleVector stride,
    const tv::SimpleVector padding, const tv::SimpleVector dilation,
    const tv::SimpleVector outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX(numActIn)) {
    numValidPoints = getValidOutPosTranspose(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      // Reserve the next free slot for this offset.
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

// For each of the first numAct entries of indicePairUnique: record the entry
// position ix in gridsOut at that flattened index, write the coordinates
// produced by rowArrayIdxInv into columns 1..NDim of output row ix, and store
// (value returned by rowArrayIdxInv) % batchSize in column 0.
// indicePairs is not used here.
template __global__ void assignGridAndIndiceOutKernel(
    tv::TensorView indicesOut, tv::TensorView gridsOut, int numAct,
    tv::TensorView indicePairs, tv::TensorView indicePairUnique,
    const tv::SimpleVector outSpatialShape, int batchSize) {
  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX(numAct)) {
    index = indicePairUnique[ix];
    gridsOut[index] = ix;
    index = tv::rowArrayIdxInv(index, indicesOutPtr + ix * (NDim + 1) + 1,
                               outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = index % batchSize;
  }
}

// Replace every indicePairs(i, 1, ix) that is > -1 by the value gridsOut holds
// at that index. indicesOut, indicePairUnique and outSpatialShape are not used
// here.
template __global__ void assignIndicePairsKernel(
    tv::TensorView indicesOut, tv::TensorView gridsOut, int numActIn,
    tv::TensorView indicePairs, tv::TensorView indicePairUnique,
    const tv::SimpleVector outSpatialShape) {
  Index index;
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX(numActIn)) {
    for (int i = 0; i < kernelVolume; ++i) {
      index = indicePairs(i, 1, ix);
      if (index > -1) {
        indicePairs(i, 1, ix) = gridsOut[index];
      }
    }
  }
}

// Store each input row index ix in gridsOut at the flattened position of that
// row (rowArrayIdx of columns 1..NDim plus spatialVolume * column 0).
template __global__ void prepareSubMGridKernel(
    tv::TensorView indicesIn, tv::TensorView gridsOut,
    const tv::SimpleVector outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index = 0;
  for (int ix : tv::KernelLoopX(numActIn)) {
    index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1,
                            outSpatialShape.data()) +
            spatialVolume * indicesIn(ix, 0);
    gridsOut[index] = ix;
  }
}

// For every input row and every position returned by getValidOutPos, look the
// flattened position up in gridsOut; when the stored value is > -1, reserve a
// slot via atomicAdd on indiceNum[offset] and record the pair
// (ix, gridsOut[index]) in indicePairs.
template __global__ void getSubMIndicePairsKernel(
    tv::TensorView indicesIn, tv::TensorView gridsOut,
    tv::TensorView indicePairs, tv::TensorView indiceNum,
    const tv::SimpleVector kernelSize, const tv::SimpleVector stride,
    const tv::SimpleVector padding, const tv::SimpleVector dilation,
    const tv::SimpleVector outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int ix : tv::KernelLoopX(numActIn)) {
    numValidPoints = getValidOutPos(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(), outSpatialShape.data(),
        validPoints);
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      // Only positions already registered in the grid produce a pair.
      if (gridsOut[index] > -1) {
        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, oldNum) = gridsOut[index];
        indicePairs(offset, 0, oldNum) = ix;
      }
    }
  }
}

// Reset to -1 the gridsOut entries addressed by the first numAct values of
// indicePairUnique.
template __global__ void resetGridKernel(const Index *indicePairUnique,
                                         tv::TensorView gridsOut,
                                         int numAct) {
  for (int ix : tv::KernelLoopX(numAct)) {
    gridsOut[indicePairUnique[ix]] = -1;
  }
}

// Reset to -1 the gridsOut entries addressed by the first numAct rows of
// `indices` (NDim + 1 values per row; column 0 is multiplied by spatialVolume,
// columns 1..NDim are flattened with rowArrayIdx).
template __global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView gridsOut,
    const tv::SimpleVector outSpatialShape, int numAct) {
  // Local int copy of the spatial shape, passed to rowArrayIdx below.
  int outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];
  }
  Index spatialVolume = 1;
  auto indsPtr = indices;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index;
  for (int ix : tv::KernelLoopX(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
    index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
}

#endif

================================================
FILE: mmcv/ops/csrc/common/cuda/spconv/reordering.cuh
================================================
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
// NOTE(review): the header name of the following #include, the template
// parameter lists and the reinterpret_cast target types are missing in this
// copy of the file.
#include

// Gathers rows of `features` into `buffer`.
//
// Row r of buffer (numPlanes values) is copied from row indices[r] of
// features, for r in [0, size). Each thread handles NumILP rows per x-loop
// step, spaced gridDim.x * blockDim.x apart, and walks the planes with the
// y-loop.
template __global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features, const Index *indices, int size, int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      // rows past `size` are skipped; inds[ilp] is then left unset and is
      // not read below because the same guard protects the copy
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = features[inds[ilp] + iy];
      }
    }
  }
}

// Same traversal as gatherGenericKernel, but both buffers are accessed
// through reinterpret_cast.
// presumably the cast target is a wider vector type so that numPlanes counts
// vector elements - TODO confirm against the caller.
template __global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features, const Index *indices, int size, int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast(buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = reinterpret_cast(features)[inds[ilp] + iy];
      }
    }
  }
}

// Gather variant in which blockIdx.x selects a NumTLP-wide column tile and
// threadIdx.x the element inside the tile; rows are walked with the y-loop.
// NOTE(review): unlike the kernels above there is no
// `iy + ILPStrideY[ilp] < size` guard, so the launch configuration must
// guarantee that every accessed row exists - confirm against the caller.
template __global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features, const Index *indices, int size, int numPlanes) {
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  // shift both base pointers to this block's column tile
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;

  for (int iy : tv::KernelLoopY(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      reinterpret_cast(buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] = reinterpret_cast(features)[indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x];
    }
  }
}

// Adds rows of `buffer` into the rows of `outFeatures` selected by `indices`:
// outFeatures[indices[r]] += buffer[r] for r in [0, size).
// NOTE(review): the += is a plain read-modify-write, not an atomic; this
// presumably relies on `indices` holding no duplicates - confirm.
template __global__ void scatterAddGenericKernel(scalar_t *outFeatures, const scalar_t *buffer, const Index *indices, int size, int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          outFeatures[inds[ilp] + iy] += buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
        }
      }
    }
  }
}

// Scatter-add variant working on column tiles (blockIdx.x * NumTLP) with one
// reinterpret_cast load/store per thread.
//
// Each step loads one vector from outFeatures and one from buffer into the
// scalar arrays buf / buf2 (vecloadFactor = sizeof(VecType) /
// sizeof(scalar_t) elements each), adds them element-wise and stores the
// result back to outFeatures.
// NOTE(review): as in gatherVecBlockKernel there is no bound check on
// iy + ILPStrideY[ilp], and the update is not atomic.
template __global__ void scatterAddVecBlockKernel(scalar_t *outFeatures, const scalar_t *buffer, const Index *indices, int size, int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast(buf)[0] = reinterpret_cast(outFeatures)[idx];
      reinterpret_cast(buf2)[0] = reinterpret_cast(buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast(outFeatures)[idx] = reinterpret_cast(buf)[0];
    }
  }
}

#endif
================================================
FILE: mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh
================================================
//
Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu #ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH #define STACK_BALL_QUERY_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void stack_ball_query_forward_cuda_kernel( int B, int M, float radius, int nsample, const T *new_xyz, const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt, int *idx) { // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features // :param xyz_batch_cnt: (batch_size), [N1, N2, ...] // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...] // output: // idx: (M, nsample) const T *cur_xyz = xyz; int *cur_idx = idx; CUDA_1D_KERNEL_LOOP(pt_idx, M) { int bs_idx = 0; for (int pt_cnt = 0; bs_idx < B; bs_idx++) { pt_cnt += new_xyz_batch_cnt[bs_idx]; if (pt_idx < pt_cnt) break; } int xyz_batch_start_idx = 0; for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k]; const T *new_xyz_p = new_xyz + pt_idx * 3; cur_xyz += xyz_batch_start_idx * 3; cur_idx += pt_idx * nsample; float radius2 = radius * radius; T new_x = new_xyz_p[0]; T new_y = new_xyz_p[1]; T new_z = new_xyz_p[2]; int n = xyz_batch_cnt[bs_idx]; int cnt = 0; for (int k = 0; k < n; ++k) { T x = cur_xyz[k * 3 + 0]; T y = cur_xyz[k * 3 + 1]; T z = cur_xyz[k * 3 + 2]; T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < radius2) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) { cur_idx[l] = k; } } cur_idx[cnt] = k; ++cnt; if (cnt >= nsample) break; } } if (cnt == 0) cur_idx[0] = -1; } } #endif // STACK_BALL_QUERY_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved. // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu #ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH #define STACK_GROUP_POINTS_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif #include template __global__ void stack_group_points_forward_cuda_kernel( int b, int c, int m, int nsample, const T *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, T *out) { // :param features: (N1 + N2 ..., C) tensor of features to group // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor // containing the indices of features to group with :param idx_batch_cnt: // (batch_size) [M1 + M2 ...] tensor containing the indices of features to // group with :return: // output: (M1 + M2, C, nsample) tensor CUDA_1D_KERNEL_LOOP(index, m * c * nsample) { const T *cur_features = features; const int *cur_idx = idx; int sample_idx = index % nsample; int c_idx = (index / nsample) % c; int pt_idx = (index / nsample / c); if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; for (int k = 1; k < b; k++) { if (pt_idx < pt_cnt) break; pt_cnt += idx_batch_cnt[k]; bs_idx = k; } int features_batch_start_idx = 0; int features_batch_end_idx = features_batch_cnt[0]; for (int k = 0; k < bs_idx; k++) { features_batch_start_idx += features_batch_cnt[k]; features_batch_end_idx = features_batch_start_idx + features_batch_cnt[k + 1]; } cur_features += features_batch_start_idx * c; cur_idx += pt_idx * nsample + sample_idx; int in_idx = cur_idx[0] * c + c_idx; int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx; if (in_idx < features_batch_end_idx * c) { out[out_idx] = cur_features[in_idx]; } } } template __global__ void stack_group_points_backward_cuda_kernel( int b, int 
c, int m, int n, int nsample, const T *grad_out, const int *idx, const int *idx_batch_cnt, const int *features_batch_cnt, T *grad_features) { // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing // the indices of features to group with :param idx_batch_cnt: (batch_size) // [M1 + M2 ...] tensor containing the indices of features to group with // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the // indices of features to group with :return: // grad_features: (N1 + N2 ..., C) gradient of the features CUDA_1D_KERNEL_LOOP(index, m * c * nsample) { const T *cur_grad_out = grad_out; const int *cur_idx = idx; T *cur_grad_features = grad_features; int sample_idx = index % nsample; int c_idx = (index / nsample) % c; int pt_idx = (index / nsample / c); if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; for (int k = 1; k < b; k++) { if (pt_idx < pt_cnt) break; pt_cnt += idx_batch_cnt[k]; bs_idx = k; } int features_batch_start_idx = 0; for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k]; cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx; cur_idx += pt_idx * nsample + sample_idx; cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx; atomicAdd(cur_grad_features, cur_grad_out[0]); } } #endif // GROUP_POINTS_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/sync_bn_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef SYNCBN_CUDA_KERNEL_CUH #define SYNCBN_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void sync_bn_forward_mean_cuda_kernel(const T *input, float *mean, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer[tid] += input[index]; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { mean[c] = buffer[0] / total; } } template <> __global__ void sync_bn_forward_mean_cuda_kernel(const phalf *input, float *mean, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer[tid] += static_cast(input[index]); } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { mean[c] = buffer[0] / total; } } template __global__ void sync_bn_forward_var_cuda_kernel(const T *input, const float *mean, float *var, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; float td = input[index] - mean[c]; buffer[tid] += td * td; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * 
spatial; if (tid == 0) { var[c] = buffer[0] / total; } } template <> __global__ void sync_bn_forward_var_cuda_kernel(const phalf *input, const float *mean, float *var, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; float td = static_cast(input[index]) - mean[c]; buffer[tid] += td * td; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { var[c] = buffer[0] / total; } } template __global__ void sync_bn_forward_output_cuda_kernel( const T *input, const float *mean, const float *var, float *running_mean, float *running_var, const float *weight, const float *bias, float *norm, float *std, T *output, int num, int channels, int spatial, float eps, float momentum, int group_size) { int tid = threadIdx.x; int c = blockIdx.x; float mean_value = mean[c]; float std_value = sqrt(var[c] + eps); if (weight != nullptr) { float weight_value = weight[c]; float bias_value = bias[c]; if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; norm[index] = (input[index] - mean_value) / std_value; output[index] = norm[index] * weight_value + bias_value; } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = (input[index] - mean_value) / std_value * weight_value + bias_value; } } } else { if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = norm[index] = (input[index] - mean_value) / std_value; } } else { for (int i = tid; i < num 
* spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = (input[index] - mean_value) / std_value; } } } if (tid == 0) { if (std != nullptr) std[c] = std_value; if (running_mean != nullptr) { running_mean[c] = momentum * mean_value + (1 - momentum) * running_mean[c]; int count = num * spatial * group_size; float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c]; running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; } } } template <> __global__ void sync_bn_forward_output_cuda_kernel( const phalf *input, const float *mean, const float *var, float *running_mean, float *running_var, const float *weight, const float *bias, float *norm, float *std, phalf *output, int num, int channels, int spatial, float eps, float momentum, int group_size) { int tid = threadIdx.x; int c = blockIdx.x; float mean_value = mean[c]; float std_value = sqrt(var[c] + eps); if (weight != nullptr) { float weight_value = weight[c]; float bias_value = bias[c]; if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; norm[index] = (static_cast(input[index]) - mean_value) / std_value; output[index] = static_cast(norm[index] * weight_value + bias_value); } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = static_cast((static_cast(input[index]) - mean_value) / std_value * weight_value + bias_value); } } } else { if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; norm[index] = (static_cast(input[index]) - mean_value) / std_value; output[index] = static_cast(norm[index]); } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % 
spatial; output[index] = static_cast( (static_cast(input[index]) - mean_value) / std_value); } } } if (tid == 0) { if (std != nullptr) std[c] = std_value; if (running_mean != nullptr) { running_mean[c] = momentum * mean_value + (1 - momentum) * running_mean[c]; int count = num * spatial * group_size; float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c]; running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; } } } template __global__ void sync_bn_backward_param_cuda_kernel(const T *grad_output, const float *norm, float *grad_weight, float *grad_bias, int num, int channels, int spatial) { __shared__ float buffer1[THREADS_PER_BLOCK]; __shared__ float buffer2[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer1[tid] = buffer2[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer1[tid] += grad_output[index] * norm[index]; buffer2[tid] += grad_output[index]; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer1[tid] += buffer1[tid + s]; buffer2[tid] += buffer2[tid + s]; } __syncthreads(); } if (tid == 0) { grad_weight[c] = buffer1[0]; grad_bias[c] = buffer2[0]; } } template <> __global__ void sync_bn_backward_param_cuda_kernel(const phalf *grad_output, const float *norm, float *grad_weight, float *grad_bias, int num, int channels, int spatial) { __shared__ float buffer1[THREADS_PER_BLOCK]; __shared__ float buffer2[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer1[tid] = buffer2[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer1[tid] += static_cast(grad_output[index]) * norm[index]; buffer2[tid] += static_cast(grad_output[index]); } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer1[tid] += buffer1[tid + s]; buffer2[tid] += buffer2[tid + 
s]; } __syncthreads(); } if (tid == 0) { grad_weight[c] = buffer1[0]; grad_bias[c] = buffer2[0]; } } template __global__ void sync_bn_backward_data_cuda_kernel( int output_size, const T *grad_output, const float *weight, const float *grad_weight, const float *grad_bias, const float *norm, const float *std, T *grad_input, int num, int channels, int spatial) { int factor = num * spatial; CUDA_1D_KERNEL_LOOP(index, output_size) { int c = (index / spatial) % channels; grad_input[index] = weight[c] * (grad_output[index] - (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / std[c]; } } template <> __global__ void sync_bn_backward_data_cuda_kernel( int output_size, const phalf *grad_output, const float *weight, const float *grad_weight, const float *grad_bias, const float *norm, const float *std, phalf *grad_input, int num, int channels, int spatial) { int factor = num * spatial; CUDA_1D_KERNEL_LOOP(index, output_size) { int c = (index / spatial) % channels; grad_input[index] = static_cast( weight[c] * (static_cast(grad_output[index]) - (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / std[c]); } } #endif // SYNCBN_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef THREE_INTERPOLATE_CUDA_KERNEL_CUH #define THREE_INTERPOLATE_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void three_interpolate_forward_cuda_kernel( int b, int c, int m, int n, const T *points, const int *__restrict__ idx, const T *weight, T *out) { // points: (B, C, M) // idx: (B, N, 3) // weight: (B, N, 3) // output: // out: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, n) { if (bs_idx >= b || c_idx >= c) return; weight += bs_idx * n * 3 + pt_idx * 3; points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3; out += bs_idx * c * n + c_idx * n; out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]]; } } template __global__ void three_interpolate_backward_cuda_kernel( int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx, const T *weight, T *grad_points) { // grad_out: (B, C, N) // weight: (B, N, 3) // output: // grad_points: (B, C, M) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, n) { if (bs_idx >= b || c_idx >= c) return; grad_out += bs_idx * c * n + c_idx * n + pt_idx; weight += bs_idx * n * 3 + pt_idx * 3; grad_points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3; atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); } } #endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef THREE_NN_CUDA_KERNEL_CUH #define THREE_NN_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void three_nn_forward_cuda_kernel(int b, int n, int m, const T *unknown, const T *known, T *dist2, int *__restrict__ idx) { // unknown: (B, N, 3) // known: (B, M, 3) // output: // dist2: (B, N, 3) // idx: (B, N, 3) int bs_idx = blockIdx.y; CUDA_1D_KERNEL_LOOP(pt_idx, n) { if (bs_idx >= b) return; unknown += bs_idx * n * 3 + pt_idx * 3; known += bs_idx * m * 3; dist2 += bs_idx * n * 3 + pt_idx * 3; idx += bs_idx * n * 3 + pt_idx * 3; T ux = unknown[0]; T uy = unknown[1]; T uz = unknown[2]; double best1 = 1e40, best2 = 1e40, best3 = 1e40; int besti1 = 0, besti2 = 0, besti3 = 0; for (int k = 0; k < m; ++k) { T x = known[k * 3 + 0]; T y = known[k * 3 + 1]; T z = known[k * 3 + 2]; T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); if (d < best1) { best3 = best2; besti3 = besti2; best2 = best1; besti2 = besti1; best1 = d; besti1 = k; } else if (d < best2) { best3 = best2; besti3 = besti2; best2 = d; besti2 = k; } else if (d < best3) { best3 = d; besti3 = k; } } dist2[0] = best1; dist2[1] = best2; dist2[2] = best3; idx[0] = besti1; idx[1] = besti2; idx[2] = besti3; } } #endif // THREE_NN_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/cuda/tin_shift_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef TIN_SHIFT_CUDA_KERNEL_CUH #define TIN_SHIFT_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif template __global__ void tin_shift_forward_cuda_kernel( const int nthreads, const T* input, const int* shift, T* output, const int batch_size, const int channels, const int t_size, const int hw_size, const int group_size, const int group_channel) { CUDA_1D_KERNEL_LOOP(index, nthreads) { const int hw_index = index % hw_size; const int j = (index / hw_size) % channels; const int n_index = (index / hw_size / channels) % batch_size; int group_id = j / group_channel; int t_shift = shift[n_index * group_size + group_id]; int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; for (int i = 0; i < t_size; i++) { int now_t = i + t_shift; int data_id = i * hw_size * channels + offset; if (now_t < 0 || now_t >= t_size) { continue; } int out_id = now_t * hw_size * channels + offset; output[out_id] = input[data_id]; } } } template __global__ void tin_shift_backward_cuda_kernel( const int nthreads, const T* input, const int* shift, T* output, const int batch_size, const int channels, const int t_size, const int hw_size, const int group_size, const int group_channel) { CUDA_1D_KERNEL_LOOP(index, nthreads) { const int hw_index = index % hw_size; const int j = (index / hw_size) % channels; const int n_index = (index / hw_size / channels) % batch_size; int group_id = j / group_channel; int t_shift = shift[n_index * group_size + group_id]; int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; for (int i = 0; i < t_size; i++) { int now_t = i + t_shift; int data_id = i * hw_size * channels + offset; if (now_t < 0 || now_t >= t_size) { continue; } int out_id = now_t * hw_size * channels + offset; output[out_id] = input[data_id]; } } } #endif // TIN_SHIFT_CUDA_KERNEL_CUH ================================================ FILE: 
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh ================================================ // Copyright (c) OpenMMLab. All rights reserved. #ifndef VOXELIZATION_CUDA_KERNEL_CUH #define VOXELIZATION_CUDA_KERNEL_CUH #ifdef MMCV_USE_PARROTS #include "parrots_cuda_helper.hpp" #else #include "pytorch_cuda_helper.hpp" #endif typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; template __global__ void dynamic_voxelize_kernel( const T* points, T_int* coors, const float voxel_x, const float voxel_y, const float voxel_z, const float coors_x_min, const float coors_y_min, const float coors_z_min, const float coors_x_max, const float coors_y_max, const float coors_z_max, const int grid_x, const int grid_y, const int grid_z, const int num_points, const int num_features, const int NDim) { // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; CUDA_1D_KERNEL_LOOP(index, num_points) { // To save some computation auto points_offset = points + index * num_features; auto coors_offset = coors + index * NDim; int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x); if (c_x < 0 || c_x >= grid_x) { coors_offset[0] = -1; continue; } int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y); if (c_y < 0 || c_y >= grid_y) { coors_offset[0] = -1; coors_offset[1] = -1; continue; } int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z); if (c_z < 0 || c_z >= grid_z) { coors_offset[0] = -1; coors_offset[1] = -1; coors_offset[2] = -1; } else { coors_offset[0] = c_z; coors_offset[1] = c_y; coors_offset[2] = c_x; } } } template __global__ void assign_point_to_voxel(const int nthreads, const T* points, T_int* point_to_voxelidx, T_int* coor_to_voxelidx, T* voxels, const int max_points, const int num_features, const int num_points, const int NDim) { CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; int index = thread_idx / num_features; int num = point_to_voxelidx[index]; int voxelidx = 
coor_to_voxelidx[index]; if (num > -1 && voxelidx > -1) { auto voxels_offset = voxels + voxelidx * max_points * num_features + num * num_features; int k = thread_idx % num_features; voxels_offset[k] = points[thread_idx]; } } } template __global__ void assign_voxel_coors(const int nthreads, T_int* coor, T_int* point_to_voxelidx, T_int* coor_to_voxelidx, T_int* voxel_coors, const int num_points, const int NDim) { CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; // if (index >= num_points) return; int index = thread_idx / NDim; int num = point_to_voxelidx[index]; int voxelidx = coor_to_voxelidx[index]; if (num == 0 && voxelidx > -1) { auto coors_offset = voxel_coors + voxelidx * NDim; int k = thread_idx % NDim; coors_offset[k] = coor[thread_idx]; } } } template __global__ void point_to_voxelidx_kernel(const T_int* coor, T_int* point_to_voxelidx, T_int* point_to_pointidx, const int max_points, const int max_voxels, const int num_points, const int NDim) { CUDA_1D_KERNEL_LOOP(index, num_points) { auto coor_offset = coor + index * NDim; // skip invalid points if (coor_offset[0] == -1) continue; int num = 0; int coor_x = coor_offset[0]; int coor_y = coor_offset[1]; int coor_z = coor_offset[2]; // only calculate the coors before this coor[index] for (int i = 0; i < index; ++i) { auto prev_coor = coor + i * NDim; if (prev_coor[0] == -1) continue; // Find all previous points that have the same coors // if find the same coor, record it if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) && (prev_coor[2] == coor_z)) { num++; if (num == 1) { // point to the same coor that first show up point_to_pointidx[index] = i; } else if (num >= max_points) { // out of boundary break; } } } if (num == 0) { point_to_pointidx[index] = index; } if (num < max_points) { point_to_voxelidx[index] = num; } } } template __global__ void determin_voxel_num( // const T_int* coor, T_int* num_points_per_voxel, T_int* point_to_voxelidx, 
T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num, const int max_points, const int max_voxels, const int num_points) { // only calculate the coors before this coor[index] for (int i = 0; i < num_points; ++i) { int point_pos_in_voxel = point_to_voxelidx[i]; // record voxel if (point_pos_in_voxel == -1) { // out of max_points or invalid point continue; } else if (point_pos_in_voxel == 0) { // record new voxel int voxelidx = voxel_num[0]; if (voxel_num[0] >= max_voxels) continue; voxel_num[0] += 1; coor_to_voxelidx[i] = voxelidx; num_points_per_voxel[voxelidx] = 1; } else { int point_idx = point_to_pointidx[i]; int voxelidx = coor_to_voxelidx[point_idx]; if (voxelidx != -1) { coor_to_voxelidx[i] = voxelidx; num_points_per_voxel[voxelidx] += 1; } } } } __global__ void nondeterministic_get_assign_pos( const int nthreads, const int32_t* coors_map, int32_t* pts_id, int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) { CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { int coors_idx = coors_map[thread_idx]; if (coors_idx > -1) { int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1); pts_id[thread_idx] = coors_pts_pos; if (coors_pts_pos == 0) { coors_order[coors_idx] = atomicAdd(coors_count, 1); } } } } template __global__ void nondeterministic_assign_point_voxel( const int nthreads, const T* points, const int32_t* coors_map, const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count, const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count, const int max_voxels, const int max_points, const int num_features, const int NDim) { CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { int coors_idx = coors_map[thread_idx]; int coors_pts_pos = pts_id[thread_idx]; if (coors_idx > -1 && coors_pts_pos < max_points) { int coors_pos = coors_order[coors_idx]; if (coors_pos < max_voxels) { auto voxels_offset = voxels + (coors_pos * max_points + coors_pts_pos) * num_features; auto points_offset = points + thread_idx * 
num_features; for (int k = 0; k < num_features; k++) { voxels_offset[k] = points_offset[k]; } if (coors_pts_pos == 0) { pts_count[coors_pos] = min(reduce_count[coors_idx], max_points); auto coors_offset = coors + coors_pos * NDim; auto coors_in_offset = coors_in + coors_idx * NDim; for (int k = 0; k < NDim; k++) { coors_offset[k] = coors_in_offset[k]; } } } } } } #endif // VOXELIZATION_CUDA_KERNEL_CUH ================================================ FILE: mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp ================================================ /************************************************************************* * Copyright (C) 2021 Cambricon. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*************************************************************************/
#ifndef COMMON_MLU_HELPER_HPP_
#define COMMON_MLU_HELPER_HPP_

// Alignment unit and stack reservation used by the size limits below.
#define NFU_ALIGN_SIZE 128          // Byte
#define REM_FOR_STACK (128 * 1024)  // 128KB reserved for cncc

// Usable NRAM / SRAM sizes in bytes. When __BANG_ARCH__ is defined they are
// derived from the compiler-provided sizes (in KB) minus REM_FOR_STACK;
// otherwise fixed fallback values are used.
#ifdef __BANG_ARCH__
#define MAX_NRAM_SIZE \
  (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK)  // 128KB reserved for cncc
#define MAX_SRAM_SIZE \
  (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK)  // 128KB reserved for cncc
#else
#define MAX_NRAM_SIZE (384 * 1024)   // 384KB, initialization value
#define MAX_SRAM_SIZE (1920 * 1024)  // 1920KB, initialization value
#endif

// PAD_UP(x, y): rounds x up to the next multiple of y.
// Both arguments are evaluated more than once.
#ifndef PAD_UP
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#endif

// PAD_DOWN(x, y): rounds x down to a multiple of y.
#ifndef PAD_DOWN
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#endif

// CEIL_ALIGN(x, y): rounds x up to the next multiple of y using the
// (x + y - 1) / y form.
#define CEIL_ALIGN(x, y) (((x) + (y) - 1) / (y) * (y))

// NOTE(review): the template parameter lists of min / max below are missing
// in this copy of the file.

// Returns a when a < b, otherwise b.
template __mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) {
  return a < b ? a : b;
}

// Returns a when a > b, otherwise b.
template __mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {
  return a > b ? a : b;
}

/*!
 * @brief Converts int32 to float32 data type.
 *
 * @param[out] dst
 *   Pointer to NRAM that stores float32 type data (the converted result).
 * @param[in,out] dst_addition
 *   Pointer to NRAM as the workspace of dst, which has the same size as dst.
 *   It allows empty pointer on MLU300 series.
 * @param[in] src
 *   Pointer to NRAM that stores int32 type data (the values to convert).
 * @param[in,out] src_addition
 *   Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
 *   It allows empty pointer on MLU300 series.
 * @param[in] src_count
 *   The count of elements in src.
*/ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, float *src_addition, const int src_count) { #if __BANG_ARCH__ >= 300 __bang_int2float((float *)dst, (int32_t *)src, src_count, 0); #else // get sign bit const float move_23bit = 8388608.0; // 0x80000000 = 1,000000000,0000000000000000000000000000 __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x80000000); __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); // get 1 or 0 from sign bit // judg is Odd __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x00000001); __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x80000001); __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // minus xor, positive num invariant __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0xffffffff); __bang_cycle_mul(dst, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); // convert int32 to float32 __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff); __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x4b000000); __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, src_count * sizeof(float), NFU_ALIGN_SIZE); __bang_sub_scalar(dst, dst, move_23bit, src_count); // add one __bang_add(dst, dst, dst_addition, src_count); // set sign for float32 __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0xffffffff); __bang_cycle_mul(dst_addition, dst_addition, src_addition, 
src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x00000001); __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x80000000); __bang_cycle_band((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * 4, 128); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); #endif // __BANG_ARCH__ >= 300 } /*! * @brief Converts float32 to int32 data type with to_zero round mode. * * @param[out] dst * Pointer to NRAM that stores float32 type data. * @param[in,out] dst_addition * Pointer to NRAM as the workspace of dst, which has the same size as dst. * It allows empty pointer on MLU300 series. * @param[in] src * Pointer to NRAM that stores int32 type data. * @param[in,out] src_addition * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. * It allows empty pointer on MLU300 series. * @param[in] src_count * The count of elements in src. */ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, float *src_addition, const int src_count) { #if __BANG_ARCH__ >= 300 __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0); #else // sign ===> src_addition // dst=-1.0 : when src[i] is a negative number // dst=+1.0 : when src[i] is a positive number const int floatDchar = sizeof(float) / sizeof(char); __bang_active_sign((float *)dst, src, src_count); // dst_addition = abs(src) __bang_mul(dst_addition, src, (float *)dst, src_count); // if dst_addition < 1.0 , then src_addition + 1, to fix add error. 
__bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f); __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0xbf800000); // set negative flag -1.0 = 0xbf80000 __bang_cycle_eq( (float *)dst, (float *)dst, (float *)src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] __bang_active_abs(dst_addition, src, src_count); __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f); // mask shift move 23 __bang_cycle_add_tz( dst_addition, dst_addition, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit // two`s complement for negatibe // dst=1.0 , when src <-1.0 // dst=0.0 , when src >=-1.0 __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); // to fix max value // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, // means max value. 
__bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count); __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, src_count * floatDchar); // get low 23bit __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), (unsigned)0x007fffff); // mask low 23bit is 1 __bang_cycle_band((char *)dst_addition, (char *)dst_addition, (char *)src_addition, src_count * floatDchar, NFU_ALIGN_SIZE / sizeof(char)); // set 9 high bit ===> dst // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, NFU_ALIGN_SIZE / sizeof(float)); // src or dst_addition __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, src_count * floatDchar); __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count); __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * floatDchar); #endif // __BANG_ARCH__ >= 300 } /*! * @brief Converts float32 to half data type, * the rounding mode on MLU200 is rd, on MLU300 is rn. * * @param[out] dst * Pointer to NRAM that stores half type data. * @param[in] src * Pointer to NRAM that stores float32 type data. * @param[in] src_count * The count of elements in src. */ __mlu_func__ inline void convertFloat2half(half *dst, float *src, int src_count) { #if __BANG_ARCH__ >= 300 __bang_float2half_rn(dst, src, src_count); #else __bang_float2half_rd(dst, src, src_count); #endif } /*! * @brief recursiveSumPool. * @param[in,out] dst * Pointer to NRAM that stores the input and output data. * @param[in] low_dim * Which is the number of low dim. * @param[in] high_dim * Which is the number of high dim. * @param[in] kernel_limit * Which is the high_dim of sumpool per time. 
******************************************************************************/ template __mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim, int kernel_limit) { for (; high_dim > 1;) { int repeat_s = high_dim / kernel_limit; int remain_s = high_dim % kernel_limit; if (remain_s) { __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1, 1); } if (repeat_s) { __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0), (T *)dst + remain_s * low_dim, low_dim, kernel_limit * repeat_s, 1, kernel_limit, 1, 1, kernel_limit); } high_dim = repeat_s + (bool)remain_s; } return; } #endif // COMMON_MLU_HELPER_HPP_ ================================================ FILE: mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu ================================================ /************************************************************************* * Copyright (C) 2022 Cambricon. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*************************************************************************/ #include "common_mlu_helper.hpp" __nram__ char nram_buffer[MAX_NRAM_SIZE]; template __mlu_func__ void MLUUnion1MaskedIm2colForward( const T *feature, const int height, const int width, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, T *data_col) { for (int index = taskId; index < mask_cnt; index += taskDim) { const int h_col = mask_h_idx[index]; const int w_col = mask_w_idx[index]; const int h_offset = h_col - pad_h; const int w_offset = w_col - pad_w; int h_start = h_offset; int h_end = h_offset + kernel_h - 1; int w_start = w_offset; int w_end = w_start + kernel_w - 1; if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) { continue; } else { int h_start_valid = max(0, h_start); int h_end_valid = min(height - 1, h_end); int w_start_valid = max(0, w_start); int w_end_valid = min(width - 1, w_end); __memcpy( data_col + index * kernel_h * kernel_w * channels + ((h_start_valid - h_start) * kernel_w + (w_start_valid - w_start)) * channels, feature + h_start_valid * width * channels + w_start_valid * channels, (w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM, kernel_w * channels * sizeof(T), width * channels * sizeof(T), h_end_valid - h_start_valid); } } } template __mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height, const int width, const int channels, const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt, T *im) { const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T); if (channels <= channels_max_num_nram) { const int deal_num = channels_max_num_nram / channels; int mask_per_core = mask_cnt / taskDim; const int mask_remain = mask_cnt % taskDim; mask_per_core += taskId < mask_remain ? 1 : 0; int index_start = taskId < mask_remain ? 
taskId * mask_per_core : taskId * mask_per_core + mask_remain; int loop = mask_per_core / deal_num; int remain_num = mask_per_core % deal_num; T *nram_col = (T *)nram_buffer; for (int index = 0; index < loop; ++index) { int cur_index = index_start + index * deal_num; __memcpy(nram_col, col + cur_index * channels, deal_num * channels * sizeof(T), GDRAM2NRAM); for (int i = 0; i < deal_num; ++i) { int mask_index = cur_index + i; const int h_im = mask_h_idx[mask_index]; const int w_im = mask_w_idx[mask_index]; // if(h_im>=height || w_im>=width) continue; __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, channels * sizeof(T), NRAM2GDRAM); } } if (remain_num > 0) { int cur_index = index_start + loop * deal_num; __memcpy(nram_col, col + cur_index * channels, remain_num * channels * sizeof(T), GDRAM2NRAM); for (int i = 0; i < remain_num; ++i) { int mask_index = cur_index + i; const int h_im = mask_h_idx[mask_index]; const int w_im = mask_w_idx[mask_index]; // if(h_im>=height || w_im>=width) continue; __memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels, channels * sizeof(T), NRAM2GDRAM); } } } else { for (int index = taskId; index < mask_cnt; index += taskDim) { const int m_index = index % mask_cnt; const int h_im = mask_h_idx[m_index]; const int w_im = mask_w_idx[m_index]; // if(h_im>=height || w_im>=width) continue; __memcpy(im + (h_im * width + w_im) * channels, col + index * channels, channels * sizeof(T), GDRAM2GDRAM); } } } __mlu_global__ void MLUKernelMaskedIm2colForward( const void *feature, const int height, const int width, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, void *data_col, const cnrtDataType_t data_dtype) { if (coreId == 0x80) { return; } switch (data_dtype) { case CNRT_FLOAT16: { MLUUnion1MaskedIm2colForward((half *)feature, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, (int32_t 
*)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, (half *)data_col); }; break; case CNRT_FLOAT32: { MLUUnion1MaskedIm2colForward((float *)feature, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, (float *)data_col); }; break; default: { break; } } } __mlu_global__ void MLUKernelMaskedCol2imForward( const void *col, const int height, const int width, const int channels, const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt, void *im, const cnrtDataType_t data_dtype) { if (coreId == 0x80) { return; } switch (data_dtype) { case CNRT_FLOAT16: { MLUUnion1MaskedCol2imForward((half *)col, height, width, channels, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, (half *)im); }; break; case CNRT_FLOAT32: { MLUUnion1MaskedCol2imForward((float *)col, height, width, channels, (int32_t *)mask_h_idx, (int32_t *)mask_w_idx, mask_cnt, (float *)im); }; break; default: { break; } } } void KernelMaskedIm2colForward( cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, cnrtDataType_t k_dtype, const void *im_ptr, const int height, const int width, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const void *mask_h_idx_ptr, const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) { MLUKernelMaskedIm2colForward<<>>( im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w, mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype); } void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, cnrtDataType_t k_dtype, const void *col_ptr, const int height, const int width, const int channels, const void *mask_h_idx_ptr, const void *mask_w_idx_ptr, const int mask_cnt, void *im_ptr) { MLUKernelMaskedCol2imForward<<>>( col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, im_ptr, k_dtype); } ================================================ FILE: 
mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu ================================================ /************************************************************************* * Copyright (C) 2022 Cambricon. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ #include "common_mlu_helper.hpp" #define ALIGN_SIZE 64 #define PIPELINE_COMMON_NUM 2 #define PIPELINE_PINGPONG_NUM 10 __nram__ char nram_buffer[MAX_NRAM_SIZE]; namespace forward { template __mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height, int width, int channels, int p_height, int p_width, T spatial_scale, int *bin_x1, int *bin_y1, int *bin_x2, int *bin_y2, int *bin_wdim, int *bin_hdim, int *bin_dims, T **input_base, bool *is_empty) { int pw = bin_i % p_width; int ph = (bin_i / p_width) % p_height; int roi_n = bin_i / p_width / p_height; /*roi*/ const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,} int batch_index = (int)roi_info[0]; int roi_x1 = round(roi_info[1] * spatial_scale); int roi_y1 = round(roi_info[2] * spatial_scale); int roi_x2 = round(roi_info[3] * spatial_scale); int roi_y2 = round(roi_info[4] * spatial_scale); int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1; int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1; /*bin*/ T bin_w = (T)roi_w / (T)p_width; T bin_h = (T)roi_h / (T)p_height; *bin_x1 = (int)floor((T)pw * bin_w) + roi_x1; *bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0; *bin_x1 = *bin_x1 < width ? 
*bin_x1 : width; *bin_y1 = (int)floor((T)ph * bin_h) + roi_y1; *bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0; *bin_y1 = *bin_y1 < height ? *bin_y1 : height; *bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1; *bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0; *bin_x2 = *bin_x2 < width ? *bin_x2 : width; *bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1; *bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0; *bin_y2 = *bin_y2 < height ? *bin_y2 : height; *input_base = input_v + batch_index * height * width * channels; *bin_wdim = *bin_x2 - *bin_x1; *bin_hdim = *bin_y2 - *bin_y1; *bin_dims = (*bin_hdim) * (*bin_wdim); *is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1); } template __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, int channels, int height, int width, int p_height, int p_width, int rois_num, T spatial_scale, T *output_v, int *argmax) { /* * NRAM partition * |---------------------------------------------------| * | ping | * |---------------------------------------------------| * | pong | * |---------------------------------------------------| * | out | * |---------------------------------------------------| * | argmax | * |---------------------------------------------------| * | a | * |---------------------------------------------------| * | b | * |---------------------------------------------------| */ uint32_t is_half = sizeof(T) == sizeof(half) ? 
true : false; uint32_t t_size = sizeof(T); uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float); uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half); uint32_t channels_align = PAD_UP(channels, float_div); uint32_t nram_limit = PAD_DOWN( (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div); // nram PING/PONG, output, argamx, a, b float *nram_ping = (float *)nram_buffer; float *nram_pong = (float *)nram_buffer + nram_limit; float *nram_out = (float *)nram_buffer + 2 * nram_limit; float *nram_argmax = nram_out + channels_align; float *nram_a = nram_out + 2 * channels_align; float *nram_b = nram_out + 3 * channels_align; uint32_t c_bins_num = rois_num * p_height * p_width; uint32_t task_bins = c_bins_num / taskDim; uint32_t rem_bins = c_bins_num % taskDim; if (taskId < rem_bins) { task_bins += 1; } int bin_first = (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId); int bins_loop = bin_first + task_bins; T *input_base = NULL; T *output_base = output_v + bin_first * channels; int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL; int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims; int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims; bool is_empty = false; bool pong_is_empty = false; bool is_first_bin = true; uint32_t src_offset = 0; uint32_t dst_offset = 0; uint32_t nram_offset = 0; uint32_t half_offset = is_half ? 
(nram_limit / 2 / half_div * half_div) * 2 : 0; float *nram_tmp = NULL; uint32_t c_slice = 0; uint32_t c_slice_align = 0; uint32_t pongc_slice = 0; uint32_t pongc_slice_align = 0; for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) { getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels, p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1, &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims, &input_base, &is_empty); uint32_t c_rem = channels; c_slice = nram_limit / bin_dims / float_div * float_div; if (is_first_bin && !is_empty) { c_slice = c_slice > c_rem ? c_rem : c_slice; c_slice_align = PAD_UP(c_slice, float_div); for (int h = bin_y1; h < bin_y2; h++) { src_offset = (h * width + bin_x1) * channels; nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset; if (c_slice_align == channels) { __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, bin_wdim * c_slice * t_size, GDRAM2NRAM); } else { __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size, channels * t_size, bin_wdim - 1); } } } uint32_t c_offset = 0; while (c_rem > 0) { c_slice = c_slice > c_rem ? c_rem : c_slice; c_slice_align = PAD_UP(c_slice, float_div); /*__memcpy_async*/ if (c_rem - c_slice > 0 && !is_empty) { pongc_slice = c_rem - c_slice > c_slice ? 
c_slice : c_rem - c_slice; pongc_slice_align = PAD_UP(pongc_slice, float_div); for (int h = bin_y1; h < bin_y2; h++) { src_offset = (h * width + bin_x1) * channels + c_offset; nram_offset = (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset; __memcpy_async((T *)nram_pong + nram_offset, (T *)input_base + src_offset + c_slice, pongc_slice * t_size, GDRAM2NRAM, pongc_slice_align * t_size, channels * t_size, bin_wdim - 1); } } else if (bin_i + 1 < bins_loop) { getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width, channels, p_height, p_width, (T)spatial_scale, &pbin_x1, &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim, &pbin_dims, &input_base, &pong_is_empty); pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div); pongc_slice = pongc_slice > channels ? channels : pongc_slice; pongc_slice_align = PAD_UP(pongc_slice, float_div); if (!pong_is_empty) { for (int h = pbin_y1; h < pbin_y2; h++) { src_offset = (h * width + pbin_x1) * channels; nram_offset = (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset; if (pongc_slice_align == channels) { __memcpy_async((T *)nram_pong + nram_offset, (T *)input_base + src_offset, pbin_wdim * pongc_slice * t_size, GDRAM2NRAM); } else { __memcpy_async((T *)nram_pong + nram_offset, (T *)input_base + src_offset, pongc_slice * t_size, GDRAM2NRAM, pongc_slice_align * t_size, channels * t_size, pbin_wdim - 1); } } } } if (is_empty) { __bang_write_value((T *)nram_out, c_slice_align, (T)0); __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, c_slice * t_size, NRAM2GDRAM); if (NULL != argmax) { __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); __memcpy((int32_t *)argmax_base + dst_offset + c_offset, (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); } } else { if (is_half) { uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div); __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset, bin_align64); } __bang_maxpool((float *)nram_out, (float 
*)nram_ping, c_slice_align, bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1); if (is_half) { uint32_t c_align64 = PAD_UP(c_slice_align, half_div); __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64); } __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, c_slice * t_size, NRAM2GDRAM); if (NULL != argmax) { /*compute max_index*/ __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping, c_slice_align, bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1); convertInt2Float((float *)nram_argmax, (float *)nram_a, (int32_t *)nram_out, (float *)nram_b, c_slice_align); /*compute input_h*/ for (int i = 0; i < c_slice; i++) { nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); } __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1, c_slice_align); __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width, c_slice_align); /*compute input_w*/ __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim, c_slice_align); __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, c_slice_align); __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1, c_slice_align); __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, c_slice_align); convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, (float *)nram_out, (float *)nram_b, c_slice_align); __memcpy((int32_t *)argmax_base + dst_offset + c_offset, (int32_t *)nram_argmax, c_slice * sizeof(int32_t), NRAM2GDRAM); } } nram_tmp = nram_ping; nram_ping = nram_pong; nram_pong = nram_tmp; c_offset += c_slice; c_rem -= c_slice; __asm__ volatile("sync;"); } dst_offset += channels; is_first_bin = false; } } __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, const void *input_data, const void *input_rois, int batch, int channels, int height, int width, int pooled_height, int pooled_width, int rois_num, float spatial_scale, void *output_data, int *argmax) { switch (data_type) { case CNRT_FLOAT16: { MLUUnion1Roipool((half 
*)input_data, (half *)input_rois, batch, channels, height, width, pooled_height, pooled_width, rois_num, (half)spatial_scale, (half *)output_data, argmax); }; break; case CNRT_FLOAT32: { MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, channels, height, width, pooled_height, pooled_width, rois_num, (float)spatial_scale, (float *)output_data, argmax); }; break; default: { break; } } } } // namespace forward namespace backward { // Convert index of argmax from global grads_image to local bin in RoI. Vector // operations do not support int type, so conversion from int to float is // performed here. __mlu_func__ void convertIndex( int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1, int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int, int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w, int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w, float *nram_atomic_add, float *nram_grads_image, int width, int height, int wstart, int hstart, int w_compute, int h_compute, int align_c, int channels, int loop_flag, int loop_id, int true_limit) { convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); // This step uses scalar division, because the above vector division causes // rounding accuracy problem. for (int i = 0; i < channels; ++i) { *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width; } // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width' // operation. 
convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1, (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2, align_c); convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2, align_c); // Perform 'temp_result - hstart' operation __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, align_c); // Perform 'temp_result1 - temp_result2 * width' operation __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, align_c); convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, (float *)nram_argmax_fp_w, align_c); // Perform 'temp_result - wstart' operation __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart, align_c); // Perform 'temp_result = h * w_compute + w' operation __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, w_compute, align_c); __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_w, align_c); if (loop_flag == 1) { __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, (loop_id * true_limit), align_c); } convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, align_c); } template __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, const int32_t *argmax, T *grads_image, int channels, int height, int width, int pooled_height, int pooled_width, int rois_num, const T spatial_scale, int high_precision) { // Calculate the number of rois processed by each core int bin_num = rois_num * pooled_height * pooled_width; int loop = (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim); int tid = taskId * loop; if (bin_num % taskDim != 0) { if (tid >= bin_num) { return; } else { // last part is (bin_num - tid). 
loop = bin_num - tid < loop ? bin_num - tid : loop; } } int align_c = PAD_UP(channels, ALIGN_SIZE); // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM. int data_size = PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c - (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) / 2), ALIGN_SIZE); int hw_limit = data_size / align_c; float *nram_grads = (float *)nram_buffer; for (int idx = tid; idx < tid + loop; ++idx) { // (n, ph, pw) is a C in the pooled output int pw = idx % pooled_width; int ph = (idx / pooled_width) % pooled_height; int n = idx / pooled_width / pooled_height; const T *offset_rois = (const T *)(rois + n * 5); int roi_batch_ind = int(offset_rois[0]); // Calculate the roi region on feature maps int roi_start_w = round(offset_rois[1] * spatial_scale); int roi_start_h = round(offset_rois[2] * spatial_scale); int roi_end_w = round(offset_rois[3] * spatial_scale); int roi_end_h = round(offset_rois[4] * spatial_scale); // Force malformed rois to 1x1 int roi_width = roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1; int roi_height = roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1; T bin_size_h = (T)roi_height / (T)pooled_height; T bin_size_w = (T)roi_width / (T)pooled_width; // The corresponding bin region int hstart = int(floor((T)ph * bin_size_h)); int wstart = int(floor((T)pw * bin_size_w)); int hend = int(ceil((T)(ph + 1) * bin_size_h)); int wend = int(ceil((T)(pw + 1) * bin_size_w)); // Add roi offsets and clip to input boundaries, min(max(A, B), C); hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0; hstart = hstart < height ? hstart : height; hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0; hend = hend < height ? hend : height; wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0; wstart = wstart < width ? wstart : width; wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0; wend = wend < width ? 
wend : width; bool is_empty = (hend <= hstart) || (wend <= wstart); if (!is_empty) { int h_compute = hend - hstart; int w_compute = wend - wstart; int true_limit = hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute; int loop_int = (h_compute * w_compute) / true_limit; int rem = (h_compute * w_compute) % true_limit; int32_t *nram_argmax = (int32_t *)nram_grads + align_c; int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c; int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c; float *nram_grads_image = (float *)nram_atomic_add + align_c; if (true_limit == h_compute * w_compute) { /* * NRAM partition * |---------------------------------------------------| * | grads | * |---------------------------------------------------| * | argmax | * |---------------------------------------------------| * | argmax_temp | * |---------------------------------------------------| * | atomic_add | * |---------------------------------------------------| * | grads_image | * |---------------------------------------------------| */ // Load the data from GDRAM to NRAM. 
__memcpy( (T *)nram_grads + align_c * high_precision, (const T *)grads + (n * pooled_height * pooled_width + ph * pooled_width + pw) * channels, channels * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + align_c * high_precision, align_c); } __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + (n * pooled_height * pooled_width + ph * pooled_width + pw) * channels, channels * sizeof(int32_t), GDRAM2NRAM); // Perform pooling operation on NRAM. convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, nram_atomic_add, nram_grads_image, width, height, wstart, hstart, w_compute, h_compute, align_c, channels, 0, 0, 0); __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, (int32_t *)nram_argmax_int, align_c, h_compute, w_compute, h_compute, w_compute, h_compute, w_compute); if (high_precision) { __bang_float2half_rd((half *)nram_grads_image, (float *)nram_grads_image, h_compute * w_compute * align_c); } // Store the result on NRAM back to GDRAM. 
for (int hc = 0; hc < h_compute; ++hc) { for (int wc = 0; wc < w_compute; ++wc) { T *dst = (T *)nram_atomic_add; int grad_image_offset = (roi_batch_ind * height * width + (hc + hstart) * width + wc + wstart) * channels; T *src1 = (T *)grads_image + grad_image_offset; int nram_grads_image_offset = (hc * w_compute + wc) * align_c; T *src2 = (T *)nram_grads_image + nram_grads_image_offset; __bang_atomic_add(dst, src1, src2, channels); } } } else if (true_limit > 0) { /* * NRAM partition * |---------------------------------------------------| * | grads | * |---------------------------------------------------| * | argmax | * |--------------------ping_pong----------------------| * | argmax_temp | argmax_temp | * |------------------------|--------------------------| * | atomic_add | atomic_add | * |------------------------|--------------------------| * | grads_image | grads_image | * |---------------------------------------------------| */ // Load the data from GDRAM to NRAM. __memcpy( (T *)nram_grads + align_c * high_precision, (const T *)grads + (n * pooled_height * pooled_width + ph * pooled_width + pw) * channels, channels * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + align_c * high_precision, align_c); } __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + (n * pooled_height * pooled_width + ph * pooled_width + pw) * channels, channels * sizeof(int32_t), GDRAM2NRAM); int ping_pong = 0; int ping_pong_offset = (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2; for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { int size = (loop_id == loop_int) ? rem : true_limit; if (size == 0) { break; } // Perform pooling operation on NRAM. 
nram_argmax_fp = (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset; nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; nram_atomic_add = (float *)nram_argmax_fp_w + align_c; nram_grads_image = (float *)nram_atomic_add + align_c; int loop_id_1 = loop_id; int size_1 = ((loop_id_1) == loop_int) ? rem : true_limit; if (size_1 == 0) { break; } convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, nram_atomic_add, nram_grads_image, width, height, wstart, hstart, w_compute, h_compute, align_c, channels, 1, loop_id_1, true_limit); __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, (int32_t *)nram_argmax_int, align_c, size_1, 1, size_1, 1, size_1, 1); if (high_precision) { __bang_float2half_rd((half *)nram_grads_image, (float *)nram_grads_image, size_1 * align_c); } // Store the result on NRAM back to GDRAM. 
for (int index_size = 0; index_size < size; ++index_size) { int h = (loop_id * true_limit + index_size) / w_compute; int w = (loop_id * true_limit + index_size) % w_compute; T *dst = (T *)nram_atomic_add; T *grads_image_n = (T *)grads_image + roi_batch_ind * height * width * channels; T *src1 = (T *)grads_image_n + ((h + hstart) * width + (w + wstart)) * channels; T *src2 = (T *)nram_grads_image + index_size * align_c; __bang_atomic_add(dst, src1, src2, channels); } ping_pong = 1 - ping_pong; } } else { /* * NRAM partition * |---------------------------------------------------| * | grads | * |---------------------------------------------------| * | argmax | * |--------------------ping_pong----------------------| * | argmax_temp | argmax_temp | * |------------------------|--------------------------| * | atomic_add | atomic_add | * |------------------------|--------------------------| * | grads_image | grads_image | * |---------------------------------------------------| */ int c_limit = PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) / (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2), ALIGN_SIZE); int loop_int = channels / c_limit; int rem = channels % c_limit; int ping_pong = 0; int ping_pong_offset = (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2; for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { int size = (loop_id == loop_int) ? 
rem : c_limit; if (size == 0) { break; } nram_argmax_fp = (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset; nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit; nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit; nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit; nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit; nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit; nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit; nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit; nram_atomic_add = (float *)nram_argmax_fp_w + c_limit; nram_grads_image = (float *)nram_atomic_add + c_limit; // This pipeline loads the data from GDRAM to NRAM. __memcpy((T *)nram_grads + c_limit * high_precision, (const T *)grads + n * pooled_height * pooled_width * channels + ph * pooled_width * channels + pw * channels + loop_id * c_limit, size * sizeof(T), GDRAM2NRAM); if (high_precision) { __bang_half2float((float *)nram_grads, (half *)nram_grads + c_limit * high_precision, c_limit); } __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax + n * pooled_height * pooled_width * channels + ph * pooled_width * channels + pw * channels + loop_id * c_limit, size * sizeof(int32_t), GDRAM2NRAM); for (int hc = 0; hc < h_compute; ++hc) { for (int wc = 0; wc < w_compute; ++wc) { // This pipeline performs pooling operation on NRAM. convertIndex( nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, nram_atomic_add, nram_grads_image, width, height, wstart + wc, hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0); __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1, 1, 1); if (high_precision) { __bang_float2half_rd((half *)nram_grads_image, (float *)nram_grads_image, c_limit); } // This pipeline stores the result on NRAM back to GDRAM. 
T *dst = (T *)nram_atomic_add; T *grads_image_n = (T *)grads_image + roi_batch_ind * height * width * channels; T *src1 = (T *)grads_image_n + ((hc + hstart) * width + (wc + wstart)) * channels + loop_id * c_limit; T *src2 = (T *)nram_grads_image; __bang_atomic_add(dst, src1, src2, size); } } ping_pong = 1 - ping_pong; } } } } } __mlu_global__ void MLUKernelRoiPoolBackward( const void *grads, const void *rois, const int *argmax, void *grads_image, int rois_num, int pooled_height, int pooled_width, int channels, int no, int height, int width, const float spatial_scale, const cnrtDataType_t k_dtype) { // make sure that memcore is not used if (coreId == 0x80) { return; } switch (k_dtype) { case CNRT_FLOAT16: { // Using the float type '__bang_max_pool_bp' instruction to increase the // bit width. const int high_precision = 1; MLUUnion1Roipool((const half *)rois, (const half *)grads, (const int32_t *)argmax, (half *)grads_image, channels, height, width, pooled_height, pooled_width, rois_num, (const half)spatial_scale, high_precision); }; break; case CNRT_FLOAT32: { const int high_precision = 0; MLUUnion1Roipool((const float *)rois, (const float *)grads, (const int32_t *)argmax, (float *)grads_image, channels, height, width, pooled_height, pooled_width, rois_num, (const float)spatial_scale, high_precision); }; break; default: { break; } } } } // namespace backward void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, cnrtDataType_t data_type, const void *input_data, const void *input_rois, const int batch, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int rois_num, const float spatial_scale, void *output_data, int *argmax) { forward::MLUKernelRoiPool<<>>( data_type, input_data, input_rois, batch, channels, height, width, pooled_height, pooled_width, rois_num, spatial_scale, output_data, argmax); } void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, 
cnrtQueue_t queue, cnrtDataType_t k_dtype, const void *grad_output_ptr, const void *rois_ptr, const int *argmax_ptr, void *grad_input_ptr, const int box_num, const int pooled_height, const int pooled_width, const int channels, const int batch, const int height, const int width, const float spatial_scale) { backward::MLUKernelRoiPoolBackward<<>>( grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num, pooled_height, pooled_width, channels, batch, height, width, spatial_scale, k_dtype); } ================================================ FILE: mmcv/ops/csrc/common/mps/MPSDevice.h ================================================ // Copyright © 2022 Apple Inc. // This file is modify from: // https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h #pragma once #include #include #include #ifdef __OBJC__ #include #include #include typedef id MTLDevice_t; #else typedef void* MTLDevice; typedef void* MTLDevice_t; #endif using namespace std; namespace at { namespace mps { //----------------------------------------------------------------- // MPSDevice // // MPSDevice is a singleton class that returns the default device //----------------------------------------------------------------- class TORCH_API MPSDevice { public: /** * MPSDevice should not be cloneable. */ MPSDevice(MPSDevice& other) = delete; /** * MPSDevice should not be assignable. */ void operator=(const MPSDevice&) = delete; /** * Gets single instance of the Device. */ static MPSDevice* getInstance(); /** * Returns the single device. 
*/ MTLDevice_t device() { return _mtl_device; } ~MPSDevice(); private: static MPSDevice* _device; MTLDevice_t _mtl_device; MPSDevice(); }; TORCH_API bool is_available(); TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); } // namespace mps } // namespace at ================================================ FILE: mmcv/ops/csrc/common/mps/MPSLibrary.h ================================================ #ifndef _MPS_LIBRARY_H_ #define _MPS_LIBRARY_H_ #include #include #ifdef __OBJC__ #include #include #include typedef id MTLComputePipelineState_t; typedef id MTLLibrary_t; #else typedef void* MTLComputePipelineState; typedef void* MTLComputePipelineState_t; typedef void* MTLLibrary; typedef void* MTLLibrary_t; #endif class MPSLibrary { public: // disable constructor for singleton static MPSLibrary* createFromUrl(const std::string& library_url); static MPSLibrary* createFromSource(const std::string& source); ~MPSLibrary(); MTLLibrary_t library() { return _library; } MTLComputePipelineState_t getComputePipelineState( const std::string& function_name); private: MTLLibrary_t _library; std::unordered_map _pso_map; }; class MPSLibraryManager { public: // disable constructor for singleton MPSLibraryManager(const MPSLibraryManager&) = delete; MPSLibraryManager& operator=(const MPSLibraryManager&) = delete; MPSLibraryManager(MPSLibraryManager&&) = delete; MPSLibraryManager& operator=(MPSLibraryManager&&) = delete; static MPSLibraryManager* getInstance(); bool hasLibrary(const std::string& name); MPSLibrary* getLibrary(const std::string& library_url); MPSLibrary* createLibraryFromSouce(const std::string& name, const std::string& sources); ~MPSLibraryManager(); private: MPSLibraryManager(); std::unordered_map> _library_map; }; #endif ================================================ FILE: mmcv/ops/csrc/common/mps/MPSLibrary.mm ================================================ #include "MPSLibrary.h" #include "MPSDevice.h" static std::unique_ptr 
mps_library_manager=nullptr; MPSLibraryManager* MPSLibraryManager::getInstance() { if(!mps_library_manager) mps_library_manager = std::unique_ptr(new MPSLibraryManager()); return mps_library_manager.get(); } MPSLibraryManager::~MPSLibraryManager() {} MPSLibraryManager::MPSLibraryManager() {} bool MPSLibraryManager::hasLibrary(const std::string& name) { return _library_map.find(name) != _library_map.end(); } MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) { if (_library_map.find(library_url) != _library_map.end()) { return _library_map[library_url].get(); } _library_map.emplace(std::make_pair( library_url, std::unique_ptr(MPSLibrary::createFromUrl(library_url)))); return _library_map[library_url].get(); } MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name, const std::string& source) { NSString* ns_name = [NSString stringWithCString:name.c_str()]; if (_library_map.find(name) != _library_map.end()) { NSLog(@"Library %@ already exist.", ns_name); return nullptr; } _library_map.emplace( std::make_pair(name, std::unique_ptr(MPSLibrary::createFromSource(source)))); return _library_map[name].get(); } MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) { MPSLibrary* library = new MPSLibrary(); @autoreleasepool { NSError* error = nil; // load library and func NSString* utl_str = [NSString stringWithCString:library_url.c_str()]; NSURL* metal_url = [NSURL fileURLWithPath:utl_str]; library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url error:&error]; if (library->_library == nil) { NSLog(@"Failed to find library, error %@.", error); exit(1); } } return library; } MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) { MPSLibrary* library = new MPSLibrary(); @autoreleasepool { NSError* error = nil; // load library and func NSString* code_str = [NSString stringWithCString:sources.c_str()]; library->_library = [at::mps::MPSDevice::getInstance()->device() 
newLibraryWithSource:code_str options:nil error:&error]; if (library->_library == nil) { NSLog(@"Failed to find library, error %@.", error); exit(1); } } return library; } MPSLibrary::~MPSLibrary() { [_library release]; _library = nil; } MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) { if (_pso_map.find(function_name) != _pso_map.end()) { return _pso_map[function_name]; } MTLComputePipelineState_t pso; @autoreleasepool { NSError* error = nil; // create function NSString* function_name_str = [NSString stringWithCString:function_name.c_str()]; id func = [_library newFunctionWithName:function_name_str]; if (func == nil) { NSLog(@"Failed to created pipeline state object, error %@.", error); exit(1); } // create pipeline pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func error:&error]; _pso_map.emplace(std::make_pair(function_name, pso)); } return _pso_map[function_name]; } ================================================ FILE: mmcv/ops/csrc/common/mps/MPSStream.h ================================================ // Copyright © 2022 Apple Inc. 
// This file is modify from: // https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h #pragma once #include #include #include #include #include #include "MPSDevice.h" #ifdef __OBJC__ #include #include #include #include typedef id MTLCommandQueue_t; typedef id MTLCommandBuffer_t; typedef id MTLSharedEvent_t; typedef id MTLDevice_t; #else typedef void* MTLCommandQueue_t; typedef void* MTLCommandQueue; typedef void* MTLCommandBuffer_t; typedef void* MTLCommandBuffer; typedef void* MTLSharedEvent_t; typedef void* dispatch_queue_t; typedef void* MTLDevice_t; #define nil NULL; #endif namespace at { namespace mps { //----------------------------------------------------------------- // MPSStream //----------------------------------------------------------------- class TORCH_API MPSStream { public: enum Unchecked { UNCHECKED }; /// Construct a MPSStream from a Stream. This construction is checked, /// and will raise an error if the Stream is not, in fact, a MPS stream. explicit MPSStream(Stream stream); ~MPSStream(); MTLCommandQueue_t commandQueue() const { return _commandQueue; }; dispatch_queue_t queue() const { return _serialQueue; } MTLCommandBuffer_t commandBuffer(); void commit(bool flush); void commitAndWait(); void synchronize(); void flush(); /// Get the MPS device index that this stream is associated with. c10::DeviceIndex device_index() const { return _stream.device_index(); } MTLCommandQueue_t stream() const { return _commandQueue; }; MTLDevice_t device() const { return [_commandQueue device]; } /// Explicit conversion to Stream. 
Stream unwrap() const { return _stream; } private: Stream _stream; MTLCommandQueue_t _commandQueue = nil; MTLCommandBuffer_t _commandBuffer = nil; void _flush(bool commitAndWait) const; dispatch_queue_t _serialQueue = nullptr; }; /** * Get the current MPS stream */ TORCH_API MPSStream* getCurrentMPSStream(); /** * Get the default MPS stream */ TORCH_API MPSStream* getDefaultMPSStream(); //----------------------------------------------------------------- // MPSStreamImpl //----------------------------------------------------------------- class TORCH_API MPSStreamImpl { public: /** * Gets single instance of the MPSStream. */ static MPSStream* getInstance(); private: static MPSStream* _stream; MPSStreamImpl(); }; //----------------------------------------------------------------- // MPSEvent //----------------------------------------------------------------- struct TORCH_API MPSEvent { MPSEvent(); // MPSEvent(id device); ~MPSEvent(); MTLSharedEvent_t event() const { return _event; } void recordEvent(MPSStream* stream); void waitForEvent(MPSStream* queue); // waits on the cpu bool queryEvent(); uint64_t getCurrentValue() { return _currentValue; } void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } private: bool _isRecorded = false; uint64_t _currentValue = 0; MTLSharedEvent_t _event; }; typedef MPSEvent* mpsEvent_t; } // namespace mps } // namespace at ================================================ FILE: mmcv/ops/csrc/common/mps/MPSUtils.h ================================================ #ifndef _MPS_UTILS_H_ #define _MPS_UTILS_H_ #include #ifdef __OBJC__ #include #include #include typedef id MTLBuffer_t; typedef id MTLComputeCommandEncoder_t; #else typedef void* MTLBuffer; typedef void* MTLBuffer_t; typedef void* MTLComputeCommandEncoder; typedef void* MTLComputeCommandEncoder_t; #endif // utils static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) { return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data()); } template , 
at::Tensor>::value, bool> = true> void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t); template , at::Tensor>::value, bool> = true> void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index]; } template , at::Tensor>::value, bool>> void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { [encoder setBytes:&t length:sizeof(t) atIndex:index]; } inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {} template void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) { setMTLArg(encoder, index, std::forward(t)); setMTLArgsImpl(encoder, index + 1, std::forward(args)...); } template void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) { [encoder setComputePipelineState:pso]; setMTLArgsImpl(encoder, 0, std::forward(args)...); } #endif ================================================ FILE: mmcv/ops/csrc/common/musa/active_rotated_filter_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved. 
// Modified from // https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu #ifndef ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH #define ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void active_rotated_filter_forward_musa_kernel( const int nthreads, const scalar_t* weight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, const int num_rotations, const int nEntry, scalar_t* output_data) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; int i = index / nEntry / num_input_planes; int k; scalar_t val = *(weight_data + index); for (k = 0; k < num_rotations; k++) { int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; scalar_t* target = output_data + i * (num_rotations * num_input_planes * nEntry) + k * (num_input_planes * nEntry) + j * (nEntry) + idx; *target = val; } } } template __global__ void active_rotated_filter_backward_musa_kernel( const int nthreads, const scalar_t* gradWeight_data, const int* indices_data, const int num_input_planes, const int num_output_planes, const int num_orientations, const int num_rotations, const int nEntry, scalar_t* weight_data) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int l = index % nEntry; int j = (index / nEntry) % num_input_planes; int i = index / nEntry / num_input_planes; int k; scalar_t* val = weight_data + index; *val = 0; scalar_t tmp = 0; for (k = 0; k < num_rotations; k++) { int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; scalar_t target = *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + k * (num_input_planes * nEntry) + j * (nEntry) + idx); tmp = tmp + target; } *val = tmp; } } #endif // ACTIVE_ROTATED_FILTER_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/assign_score_withk_musa_kernel.muh 
================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH #define ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" // input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K) // output: fout(B,O,N) // algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j) // i(k) = idx(b,i,k) // sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j) // avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k // max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j))) template __global__ void assign_score_withk_forward_musa_kernel( const int B, const int N0, const int N1, const int M, const int K, const int O, const int aggregate, const T* points, const T* centers, const T* scores, const int64_t* knn_idx, T* output) { // ----- parallel loop for B, N1, K and O --------- MUSA_1D_KERNEL_LOOP(i, B * O * N1 * K) { // ------- loop for M ---------- const int b = (int)(i / (O * N1 * K)); const int o = (int)(i % (O * N1 * K) / (N1 * K)); const int n = (int)(i % (N1 * K) / K); const int k = (int)(i % K); const int cn = (int)knn_idx[b * K * N1 + n * K + 0]; // The first neighbor is the center point const int kn = (int)knn_idx[b * K * N1 + n * K + k]; if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range return; } assert(b < B); assert(kn < N0); assert(cn < N0); assert(o < O); assert(n < N1); const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; T val = output[out_idx]; for (int m = 0; m < M; m++) { val += points[b * N0 * M * O + kn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m] - centers[b * N0 * M * O + cn * M * O + m * O + o] * scores[b * N1 * K * M + n * K * M + k * M + m]; } output[out_idx] = val; } } template __global__ void assign_score_withk_points_backward_musa_kernel( const int B, const int N0, const int N, const int M, const int K, const int O, const int aggregate, const T* grad_out, 
const T* scores, const int64_t* knn_idx, T* grad_points, T* grad_centers) { // ----- parallel loop for B, M, O --------- MUSA_1D_KERNEL_LOOP(i, B * M * O) { int b = (int)(i / (M * O)); int m = (int)(i % (M * O) / O); int o = (int)(i % O); // ----- loop for N,K --------- for (int n = 0; n < N; n++) { for (int k = 0; k < K; k++) { int kn = knn_idx[b * N * K + n * K + k]; int cn = knn_idx[b * N * K + n * K + 0]; if (kn >= N0 || kn < 0) { // if index overflows, it is out of the // neighborhood range continue; } atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, scores[b * N * K * M + n * K * M + k * M + m] * grad_out[b * O * N * K + o * N * K + n * K + k]); atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, -scores[b * N * K * M + n * K * M + k * M + m] * grad_out[b * O * N * K + o * N * K + n * K + k]); } } } } template __global__ void assign_score_withk_scores_backward_musa_kernel( const int B, const int N0, const int N, const int M, const int K, const int O, const int aggregate, const T* grad_out, const T* points, const T* centers, const int64_t* knn_idx, T* grad_scores) { // ----- parallel loop for B, N, K, M --------- MUSA_1D_KERNEL_LOOP(i, B * N * K * M) { const int b = (int)(i / (N * M * K)); const int n = (int)(i % (N * M * K) / M / K); const int k = (int)(i % (M * K) / M); const int m = (int)(i % M); const int cn = knn_idx[b * N * K + n * K + 0]; const int kn = knn_idx[b * N * K + n * K + k]; if (kn >= N0 || kn < 0) { // if index overflows, it is out of the neighborhood range return; } // -------------- loop for O ------------------------ const int out_idx = b * N * K * M + n * K * M + k * M + m; T val = grad_scores[out_idx]; for (int o = 0; o < O; o++) { val += (points[b * N0 * M * O + kn * M * O + m * O + o] - centers[b * N0 * M * O + cn * M * O + m * O + o]) * grad_out[b * O * N * K + o * N * K + n * K + k]; } grad_scores[out_idx] = val; } } #endif // ASSIGN_SCORE_WITHK_MUSA_KERNEL_MUH 
================================================ FILE: mmcv/ops/csrc/common/musa/ball_query_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu #ifndef BALL_QUERY_MUSA_KERNEL_MUH #define BALL_QUERY_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void ball_query_forward_musa_kernel(int b, int n, int m, float min_radius, float max_radius, int nsample, const T* new_xyz, const T* xyz, int* idx) { // new_xyz: (B, M, 3) // xyz: (B, N, 3) // output: // idx: (B, M, nsample) int bs_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b) return; new_xyz += bs_idx * m * 3 + pt_idx * 3; xyz += bs_idx * n * 3; idx += bs_idx * m * nsample + pt_idx * nsample; float max_radius2 = max_radius * max_radius; float min_radius2 = min_radius * min_radius; T new_x = new_xyz[0]; T new_y = new_xyz[1]; T new_z = new_xyz[2]; int cnt = 0; for (int k = 0; k < n; ++k) { T x = xyz[k * 3 + 0]; T y = xyz[k * 3 + 1]; T z = xyz[k * 3 + 2]; T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) { idx[l] = k; } } idx[cnt] = k; ++cnt; if (cnt >= nsample) break; } } } } #endif // BALL_QUERY_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/bbox_overlaps_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef BBOX_OVERLAPS_MUSA_KERNEL_MUH #define BBOX_OVERLAPS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, T& y1, T& x2, T& y2) { x1 = bbox[base]; y1 = bbox[base + 1]; x2 = bbox[base + 2]; y2 = bbox[base + 3]; } template <> __device__ __forceinline__ void load_bbox(const float* bbox, const int base, float& x1, float& y1, float& x2, float& y2) { const float4 bbox_offset = reinterpret_cast(bbox + base)[0]; x1 = bbox_offset.x; y1 = bbox_offset.y; x2 = bbox_offset.z; y2 = bbox_offset.w; } template __global__ void bbox_overlaps_musa_kernel(const T* bbox1, const T* bbox2, T* ious, const int num_bbox1, const int num_bbox2, const int mode, const bool aligned, const int offset) { if (aligned) { MUSA_1D_KERNEL_LOOP(index, num_bbox1) { const int b1 = index; const int b2 = index; const int base1 = b1 << 2; // b1 * 4 T b1_x1, b1_y1, b1_x2, b1_y2; load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); const int base2 = b2 << 2; // b2 * 4 T b2_x1, b2_y1, b2_x2, b2_y2; load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); const T width = fmaxf(right - left + offset, 0.f); const T height = fmaxf(bottom - top + offset, 0.f); const T interS = width * height; const T baseS = fmaxf(mode == 0 ? 
b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } else { MUSA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { const int b1 = index / num_bbox2; const int b2 = index % num_bbox2; const int base1 = b1 << 2; // b1 * 4 T b1_x1, b1_y1, b1_x2, b1_y2; load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); const int base2 = b2 << 2; // b2 * 4 T b2_x1, b2_y1, b2_x2, b2_y2; load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); const T width = fmaxf(right - left + offset, 0.f); const T height = fmaxf(bottom - top + offset, 0.f); const T interS = width * height; const T baseS = fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } } __device__ __forceinline__ __half __half_area(const __half x1, const __half y1, const __half x2, const __half y2, const __half offset) { const __half half_w = __hadd(__hsub(x2, x1), offset); const __half half_h = __hadd(__hsub(y2, y1), offset); return __hmul(half_w, half_h); } __device__ __forceinline__ __half __half_max(const __half a, const __half b) { return __hge(a, b) ? a : b; } __device__ __forceinline__ __half __half_min(const __half a, const __half b) { return __hle(a, b) ? a : b; } // fp16 won't provide much increase when aligned==true. It is useful when // aligned==false, which would give you ~40% bonus. __device__ void bbox_overlaps_musa_kernel_half( const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, const int num_bbox2, const int mode, const bool aligned, const int offset) { const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2; const __half h_offset = __int2half_rn(offset); MUSA_1D_KERNEL_LOOP(index, num_output) { const int b1 = aligned ? 
index : index / num_bbox2; const int b2 = aligned ? index : index % num_bbox2; const int base1 = b1 << 2; __half b1_x1, b1_y1, b1_x2, b1_y2; load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); const int base2 = b2 << 2; __half b2_x1, b2_y1, b2_x2, b2_y2; load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); const __half left = __half_max(b1_x1, b2_x1), right = __half_min(b1_x2, b2_x2); const __half top = __half_max(b1_y1, b2_y1), bottom = __half_min(b1_y2, b2_y2); const __half width = __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); const __half height = __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); const __half interS = __hmul(width, height); const __half baseS = __half_max( mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, h_offset); ious[index] = __hdiv(interS, baseS); } } #endif // BBOX_OVERLAPS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/bezier_align_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/csrc/BezierAlign/BezierAlign_cuda.cu #ifndef BEZIER_ALIGN_MUSA_KERNEL_MUH #define BEZIER_ALIGN_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" template __device__ T bezier_curve(const T p0, const T p1, const T p2, const T p3, const T u) { return ((1. - u) * (1. - u) * (1. - u) * p0 + 3. * u * (1. - u) * (1. - u) * p1 + 3. * u * u * (1. 
- u) * p2 + u * u * u * p3); } template __global__ void bezier_align_forward_musa_kernel( const int nthreads, const T *bottom_data, // inputs const T *bottom_rois, // bottom rois contains the bezier curve T *top_data, // outputs const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, bool aligned, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; // beziers have size Nx(1+8*2) = Nx17 const T *offset_bottom_rois = bottom_rois + n * 17; int roi_batch_ind = offset_bottom_rois[0]; // Do not use rounding; this implementation detail is critical T offset = aligned ? (T)0.5 : (T)0.0; // TODO: avoid this by using parallel annotation, for good T p0_x = offset_bottom_rois[1] * spatial_scale; T p0_y = offset_bottom_rois[2] * spatial_scale; T p1_x = offset_bottom_rois[3] * spatial_scale; T p1_y = offset_bottom_rois[4] * spatial_scale; T p2_x = offset_bottom_rois[5] * spatial_scale; T p2_y = offset_bottom_rois[6] * spatial_scale; T p3_x = offset_bottom_rois[7] * spatial_scale; T p3_y = offset_bottom_rois[8] * spatial_scale; T p4_x = offset_bottom_rois[15] * spatial_scale; T p4_y = offset_bottom_rois[16] * spatial_scale; T p5_x = offset_bottom_rois[13] * spatial_scale; T p5_y = offset_bottom_rois[14] * spatial_scale; T p6_x = offset_bottom_rois[11] * spatial_scale; T p6_y = offset_bottom_rois[12] * spatial_scale; T p7_x = offset_bottom_rois[9] * spatial_scale; T p7_y = offset_bottom_rois[10] * spatial_scale; // compute the coords const T u = pw / static_cast(pooled_width); const T v = ph / static_cast(pooled_height); const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); const T x1 = 
bezier_curve(p4_x, p5_x, p6_x, p7_x, u); const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); const T x_center = x1 * v + x0 * (1. - v) - offset; const T y_center = y1 * v + y0 * (1. - v) - offset; T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin // When the grid is empty, output zeros == 0/1, instead of NaN. const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T y = y_center - (T)0.5 * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = x_center - (T)0.5 * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, index); output_val += val; } } output_val /= count; top_data[index] = output_val; } } template __global__ void bezier_align_backward_musa_kernel( const int nthreads, const T *top_diff, const T *bottom_rois, T *bottom_diff, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, bool aligned, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; // beziers have size Nx(1+8*2) = Nx17 const T *offset_bottom_rois = bottom_rois + n * 17; int roi_batch_ind = offset_bottom_rois[0]; // Do not use rounding; this implementation detail is critical T offset = aligned ? 
(T)0.5 : (T)0.0; T p0_x = offset_bottom_rois[1] * spatial_scale; T p0_y = offset_bottom_rois[2] * spatial_scale; T p1_x = offset_bottom_rois[3] * spatial_scale; T p1_y = offset_bottom_rois[4] * spatial_scale; T p2_x = offset_bottom_rois[5] * spatial_scale; T p2_y = offset_bottom_rois[6] * spatial_scale; T p3_x = offset_bottom_rois[7] * spatial_scale; T p3_y = offset_bottom_rois[8] * spatial_scale; T p4_x = offset_bottom_rois[15] * spatial_scale; T p4_y = offset_bottom_rois[16] * spatial_scale; T p5_x = offset_bottom_rois[13] * spatial_scale; T p5_y = offset_bottom_rois[14] * spatial_scale; T p6_x = offset_bottom_rois[11] * spatial_scale; T p6_y = offset_bottom_rois[12] * spatial_scale; T p7_x = offset_bottom_rois[9] * spatial_scale; T p7_y = offset_bottom_rois[10] * spatial_scale; // compute the coords const T u = pw / static_cast(pooled_width); const T v = ph / static_cast(pooled_height); const T x0 = bezier_curve(p0_x, p1_x, p2_x, p3_x, u); const T y0 = bezier_curve(p0_y, p1_y, p2_y, p3_y, u); const T x1 = bezier_curve(p4_x, p5_x, p6_x, p7_x, u); const T y1 = bezier_curve(p4_y, p5_y, p6_y, p7_y, u); const T x_center = x1 * v + x0 * (1. - v) - offset; const T y_center = y1 * v + y0 * (1. - v) - offset; T roi_width = max(abs(p0_x - p3_x), abs(p4_x - p7_x)); T roi_height = max(abs(p0_y - p3_y), abs(p4_y - p7_y)); if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); T *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const T *offset_top_diff = top_diff + top_offset; const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 { const T y = y_center - (T)0.5 * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = x_center - (T)0.5 * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); T g1 = top_diff_this_bin * w1 / count; T g2 = top_diff_this_bin * w2 / count; T g3 = top_diff_this_bin * w3 / count; T g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, static_cast(g1)); atomicAdd(offset_bottom_diff + y_low * width + x_high, static_cast(g2)); atomicAdd(offset_bottom_diff + y_high * width + x_low, static_cast(g3)); atomicAdd(offset_bottom_diff + y_high * width + x_high, static_cast(g4)); } // if } // ix } // iy } // MUSA_1D_KERNEL_LOOP } // BezierAlignBackward #endif // BEZIER_ALIGN_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/border_align_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved // modified from // https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu. // the main difference: (1) use `argmax_idx` for fast computing of gradient // during the backward. (2) `wh` is directly computed by `boxes`, rather than // passing it as argument to forward or backward functions. 
#ifndef BORDER_ALIGN_MUSA_KERNEL_MUH #define BORDER_ALIGN_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 }; /*** Forward ***/ template __global__ void border_align_forward_musa_kernel( const int nthreads, const T* input, const T* boxes, T* output, int* argmax_idx, const int channels, const int box_size, const int height, const int width, const int pool_size) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (batch_idx, c_idx, box_idx) is an element paralleled for computing // output, and `extreme_idx` is in range [0,3] int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx; const T *offset_box, *offset_input, *offset_box_x; T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y, val, maxval; extreme_idx = threadIdx.y; // shape (N, C, box_size, 4) for output batch_idx = index / channels / box_size; // shape (N, box_size, 4) for boxes box_idx = index % box_size + batch_idx * box_size; c_idx = (index / box_size) % channels; offset_box = boxes + box_idx * 4; box_width = *(offset_box + 2) - *offset_box; box_height = *(offset_box + 3) - *(offset_box + 1); offset_output = output + index * 4 + extreme_idx; offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; // shape (N, 4C, h, w) for input. 
// [0,C) for top feature, [C,2C) for left feature, // [2C,3C) for bottom feature, [3C,4C) for right feature offset_input = input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * height * width; // extreme_idx in [0,1] -> offset_box_x indexed at x1 // extreme_idx in [2,3] -> offset_box_x indexed at x2 offset_box_x = offset_box + extreme_idx / 2 * 2; // (x1,y1) or (x2,y2) for (x,y) x = *offset_box_x; y = *(offset_box_x + 1); switch (extreme_idx) { // top case BorderMode::Top: stride = box_width / pool_size; x_stride = stride; y_stride = 0; break; // left case BorderMode::Left: stride = box_height / pool_size; x_stride = 0; y_stride = stride; break; // bottom case BorderMode::Bottom: stride = box_width / pool_size; x_stride = -stride; y_stride = 0; break; // right case BorderMode::Right: stride = box_height / pool_size; x_stride = 0; y_stride = -stride; break; } // initialize maxval and maxidx with the start position (e.g. (x1,y1) or // (x2,y2)) maxval = bilinear_interpolate(offset_input, height, width, y, x, index); maxidx = 0; // do max_pool along the border for (int i = 1; i <= pool_size; i++) { x += x_stride; y += y_stride; val = bilinear_interpolate(offset_input, height, width, y, x, index); if (val > maxval) { maxval = val; maxidx = i; } } // update output and argmax_idx *offset_output = maxval; *offset_argmax_idx = maxidx; } } /*** Backward ***/ template __global__ void border_align_backward_musa_kernel( const int nthreads, const T* grad_output, const T* boxes, const int* argmax_idx, T* grad_input, const int channels, const int box_size, const int height, const int width, const int pool_size) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (batch_idx, c_idx, box_idx) is an element paralleled for computing // output, and `extreme_idx` is in range [0,3] int batch_idx, c_idx, box_idx, extreme_idx; const int* offset_argmax_idx; const T *offset_grad_output, *offset_box, *offset_box_x; T *offset_grad_input, box_width, box_height, stride, x_stride, 
y_stride, x, y; extreme_idx = threadIdx.y; batch_idx = index / channels / box_size; box_idx = index % box_size + batch_idx * box_size; c_idx = (index / box_size) % channels; offset_box = boxes + box_idx * 4; box_width = *(offset_box + 2) - *offset_box; box_height = *(offset_box + 3) - *(offset_box + 1); offset_grad_output = grad_output + index * 4 + extreme_idx; offset_argmax_idx = argmax_idx + index * 4 + extreme_idx; // [0,C) for top feature grad, [C,2C) for left feature grad, // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad offset_grad_input = grad_input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) * height * width; // extreme_idx in [0,1] -> offset_box_x indexed at x1 // extreme_idx in [2,3] -> offset_box_x indexed at x2 offset_box_x = offset_box + extreme_idx / 2 * 2; switch (extreme_idx) { // top case BorderMode::Top: stride = box_width / pool_size; x_stride = stride; y_stride = 0; break; // left case BorderMode::Left: stride = box_height / pool_size; x_stride = 0; y_stride = stride; break; // bottom case BorderMode::Bottom: stride = box_width / pool_size; x_stride = -stride; y_stride = 0; break; // right case BorderMode::Right: stride = box_height / pool_size; x_stride = 0; y_stride = -stride; break; } // get position (x,y) which has maximum value during forward x = *offset_box_x; y = *(offset_box_x + 1); x += x_stride * (T)(*offset_argmax_idx); y += y_stride * (T)(*offset_argmax_idx); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); // update grad_output atomicAdd(offset_grad_input + y_low * width + x_low, *offset_grad_output * w1); atomicAdd(offset_grad_input + y_low * width + x_high, *offset_grad_output * w2); atomicAdd(offset_grad_input + y_high * width + x_low, *offset_grad_output * w3); atomicAdd(offset_grad_input + y_high * width + x_high, *offset_grad_output * w4); } } #endif // BORDER_ALIGN_MUSA_KERNEL_MUH 
================================================ FILE: mmcv/ops/csrc/common/musa/box_iou_quadri_musa.muh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved #ifndef BOX_IOU_QUADRI_MUSA_MUH #define BOX_IOU_QUADRI_MUSA_MUH #include "pytorch_musa_helper.hpp" #include "box_iou_rotated_utils.hpp" // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } template __global__ void box_iou_quadri_musa_kernel( const int n_boxes1, const int n_boxes2, const T* dev_boxes1, const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { if (aligned) { MUSA_1D_KERNEL_LOOP(index, n_boxes1) { int b1 = index; int b2 = index; int base1 = b1 * 8; float block_boxes1[8]; float block_boxes2[8]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; block_boxes1[5] = dev_boxes1[base1 + 5]; block_boxes1[6] = dev_boxes1[base1 + 6]; block_boxes1[7] = dev_boxes1[base1 + 7]; int base2 = b2 * 8; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; block_boxes2[5] = dev_boxes2[base2 + 5]; block_boxes2[6] = dev_boxes2[base2 + 6]; block_boxes2[7] = dev_boxes2[base2 + 7]; dev_ious[index] = single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); } } else { MUSA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { int b1 = index / n_boxes2; int b2 = index % n_boxes2; int base1 = b1 * 8; float block_boxes1[8]; float block_boxes2[8]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; 
block_boxes1[4] = dev_boxes1[base1 + 4]; block_boxes1[5] = dev_boxes1[base1 + 5]; block_boxes1[6] = dev_boxes1[base1 + 6]; block_boxes1[7] = dev_boxes1[base1 + 7]; int base2 = b2 * 8; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; block_boxes2[5] = dev_boxes2[base2 + 5]; block_boxes2[6] = dev_boxes2[base2 + 6]; block_boxes2[7] = dev_boxes2[base2 + 7]; dev_ious[index] = single_box_iou_quadri(block_boxes1, block_boxes2, mode_flag); } } } #endif ================================================ FILE: mmcv/ops/csrc/common/musa/box_iou_rotated_musa.muh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved // modified from // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu #ifndef BOX_IOU_ROTATED_MUSA_MUH #define BOX_IOU_ROTATED_MUSA_MUH #include "pytorch_musa_helper.hpp" #include "box_iou_rotated_utils.hpp" // 2D block with 32 * 16 = 512 threads per block const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 16; inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } template __global__ void box_iou_rotated_musa_kernel( const int n_boxes1, const int n_boxes2, const T* dev_boxes1, const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) { if (aligned) { MUSA_1D_KERNEL_LOOP(index, n_boxes1) { int b1 = index; int b2 = index; int base1 = b1 * 5; float block_boxes1[5]; float block_boxes2[5]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; int base2 = b2 * 5; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; 
block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; dev_ious[index] = single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); } } else { MUSA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) { int b1 = index / n_boxes2; int b2 = index % n_boxes2; int base1 = b1 * 5; float block_boxes1[5]; float block_boxes2[5]; block_boxes1[0] = dev_boxes1[base1 + 0]; block_boxes1[1] = dev_boxes1[base1 + 1]; block_boxes1[2] = dev_boxes1[base1 + 2]; block_boxes1[3] = dev_boxes1[base1 + 3]; block_boxes1[4] = dev_boxes1[base1 + 4]; int base2 = b2 * 5; block_boxes2[0] = dev_boxes2[base2 + 0]; block_boxes2[1] = dev_boxes2[base2 + 1]; block_boxes2[2] = dev_boxes2[base2 + 2]; block_boxes2[3] = dev_boxes2[base2 + 3]; block_boxes2[4] = dev_boxes2[base2 + 4]; dev_ious[index] = single_box_iou_rotated(block_boxes1, block_boxes2, mode_flag); } } } #endif ================================================ FILE: mmcv/ops/csrc/common/musa/carafe_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef CARAFE_MUSA_KERNEL_MUH #define CARAFE_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" #ifdef MMCV_WITH_HIP #define WARP_SIZE 64 #else #define WARP_SIZE 32 #endif #define THREADS_PER_PIXEL 32 #define MAX_SHARED_MEMORY 49152 #define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 #define MAXIMIZE_KERNEL_SIZE true #define kTileDim 32 #define kBlockRows 8 #define FULL_MASK 0xffffffff inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } __device__ inline int Loc2Index(const int n, const int c, const int h, const int w, const int channel_num, const int height, const int width) { int index = w + (h + (c + n * channel_num) * height) * width; return index; } #ifndef MMCV_WITH_HIP /* TODO: move this to a common place */ template __device__ inline scalar_t min(scalar_t a, scalar_t b) { return a < b ? a : b; } template __device__ inline scalar_t max(scalar_t a, scalar_t b) { return a > b ? 
a : b; } #endif template __device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) { for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) #ifdef MMCV_WITH_HIP val += __shfl_down(val, offset); #else val += __shfl_down_sync(FULL_MASK, val, offset); #endif return val; } template <> __device__ __forceinline__ phalf warpReduceSum(phalf val) { for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) #ifdef MMCV_WITH_HIP // Using PyTorch's macro for half support __PHALF(val) += WARP_SHFL_DOWN(val, offset); #else __PHALF(val) += __shfl_down_sync(FULL_MASK, __PHALF(val).operator __half(), offset); #endif return val; } // Splits the original matrix into submatrices with size 32 * 32. // Each block transposes one submatrix by loading it into shared memory. // Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/ template __global__ void BatchTranspose2DMUSAKernel(const int N, const int H, const int W, const int dh, const int dw, const scalar_t *__restrict__ X, scalar_t *__restrict__ Y) { __shared__ scalar_t tile[kTileDim][kTileDim + 1]; const int n = blockIdx.x / (dh * dw); const int k = blockIdx.x % (dh * dw); const int r = k / dw; const int c = k % dw; const int offset = n * H * W; int x = c * kTileDim + threadIdx.x; int y = r * kTileDim + threadIdx.y; if (x < W) { for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) { tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x]; } } __syncthreads(); x = r * kTileDim + threadIdx.x; y = c * kTileDim + threadIdx.y; if (x < H) { for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) { Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i]; } } } template __global__ void CARAFEForward( const int num_kernels, const scalar_t *__restrict__ bottom_data, const scalar_t *__restrict__ bottom_masks, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int down_height, const int down_width, const int height, 
const int width, const int mask_channels, scalar_t *__restrict__ top_data) { #if MAXIMIZE_KERNEL_SIZE __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; #else __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; #endif int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; const int split_id = threadIdx.x % THREADS_PER_PIXEL; index = index / THREADS_PER_PIXEL; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; const int down_pw = pw / scale_factor; const int down_ph = ph / scale_factor; const int start_w = down_pw - (kernel_size - 1) / 2; const int end_w = down_pw + (kernel_size - 1) / 2 + 1; const int start_h = down_ph - (kernel_size - 1) / 2; const int end_h = down_ph + (kernel_size - 1) / 2 + 1; for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels); shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; } __syncthreads(); const int channels_per_group = ceilf(channels / (float)group_size); #pragma unroll for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { int mask_group = c / channels_per_group; scalar_t output_val = 0; #pragma unroll for (int iy = start_h; iy < end_h; iy++) { #pragma unroll for (int ix = start_w; ix < end_w; ix++) { if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { continue; } int mask_iy = iy - down_ph + (kernel_size - 1) / 2; int mask_ix = ix - down_pw + (kernel_size - 1) / 2; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, iy, ix, c, down_height, down_width, channels); output_val += bottom_data[feat_index] * shared_mask[mask_c * WARP_SIZE + pixel_id]; } } int top_index = Loc2Index(n, ph, pw, c, height, width, channels); top_data[top_index] = output_val; } } template __global__ void CARAFEBackward_Feature( 
const int num_kernels, const scalar_t *__restrict__ top_diff, const scalar_t *__restrict__ bottom_masks, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int down_height, const int down_width, const int height, const int width, const int mask_channels, scalar_t *__restrict__ bottom_diff) { #if MAXIMIZE_KERNEL_SIZE __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2]; #else __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T]; #endif int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int pixel_id = threadIdx.x / THREADS_PER_PIXEL; const int split_id = threadIdx.x % THREADS_PER_PIXEL; // (n, c, ph, pw) is an element in the bottom_data index = index / THREADS_PER_PIXEL; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; const int start_w = pw - (kernel_size - 1) * scale_factor / 2; const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1; const int start_h = ph - (kernel_size - 1) * scale_factor / 2; const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1; for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) { const int mask_w = (c % kernel_size) * scale_factor; const int mask_h = (c / kernel_size % kernel_size) * scale_factor; const int mask_x = start_w + mask_w; const int mask_y = start_h + mask_h; if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) { shared_mask[c * WARP_SIZE + pixel_id] = 0; continue; } const int mask_group = c / (kernel_size * kernel_size); const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1; int mask_index = Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width); shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index]; } __syncthreads(); const int channels_per_group = ceilf(channels / (float)group_size); #pragma unroll for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { int mask_group = c / 
channels_per_group; int top_index = Loc2Index(n, ph, pw, c, height, width, channels); scalar_t output_val = 0; #pragma unroll for (int iy = start_h; iy < end_h; iy += scale_factor) { #pragma unroll for (int ix = start_w; ix < end_w; ix += scale_factor) { if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) { continue; } int mask_iy = (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor; int mask_ix = (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, iy, ix, c, height, width, channels); output_val += shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index]; } } bottom_diff[top_index] = output_val; } } template __global__ void FeatureSum(const int num_kernels, const scalar_t *__restrict__ input_data, const int scale_factor, const int channels, const int height, const int width, scalar_t *__restrict__ output_data) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int split_id = threadIdx.x % THREADS_PER_PIXEL; index = index / THREADS_PER_PIXEL; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) { scalar_t output_val = 0; for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) { for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) { int input_id = Loc2Index(n, iy, ix, c, height * scale_factor, width * scale_factor, channels); output_val += input_data[input_id]; } } const int output_id = Loc2Index(n, ph, pw, c, height, width, channels); output_data[output_id] = output_val; } } template __global__ void CARAFEBackward_Mask(const int num_kernels, const scalar_t *__restrict__ top_diff, const scalar_t *__restrict__ bottom_data, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int down_height, 
const int down_width, const int height, const int width, const int mask_channels, scalar_t *__restrict__ mask_diff) { int index = threadIdx.x + blockIdx.x * blockDim.x; if (index > num_kernels - 1) { return; } const int lane_id = index % WARP_SIZE; index = index / WARP_SIZE; const int mask_c = index % mask_channels; // (n, c, ph, pw) is an element in the bottom_data index = index / mask_channels; const int pw = index % width; const int ph = (index / width) % height; const int n = index / width / height; const int down_pw = pw / scale_factor; const int down_ph = ph / scale_factor; const int mask_group = mask_c / (kernel_size * kernel_size); const int mask_loc = mask_c % (kernel_size * kernel_size); const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2; const int offset_y = mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2; const int down_x = down_pw + offset_x; const int down_y = down_ph + offset_y; scalar_t output_val = 0; if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 && down_x <= down_width - 1) { const int channels_per_mask = ceilf(channels / (float)group_size); const int start = channels_per_mask * mask_group; const int end = min(channels_per_mask * (mask_group + 1), channels); for (int c = start + lane_id; c < end; c += WARP_SIZE) { int bottom_id = Loc2Index(n, down_y, down_x, c, down_height, down_width, channels); int top_id = Loc2Index(n, ph, pw, c, height, width, channels); output_val += top_diff[top_id] * bottom_data[bottom_id]; } } #ifdef MMCV_WITH_HIP __syncthreads(); #else __syncwarp(); #endif output_val = warpReduceSum(output_val); if (lane_id == 0) { const int mask_id = Loc2Index(n, ph, pw, mask_c, height, width, mask_channels); mask_diff[mask_id] = output_val; } } #endif // CARAFE_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/carafe_naive_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef CARAFE_NAIVE_MUSA_KERNEL_MUH #define CARAFE_NAIVE_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" __device__ inline int Loc2Index(const int n, const int c, const int h, const int w, const int channel_num, const int height, const int width) { int index = w + (h + (c + n * channel_num) * height) * width; return index; } template __global__ void carafe_naive_forward_musa_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the bottom_data int pw = index % width; int ph = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; int mask_channels = kernel_size * kernel_size * group_size; int mask_group = c / (channels / group_size); int down_pw = pw / scale_factor; int down_ph = ph / scale_factor; int down_width = width / scale_factor; int down_height = height / scale_factor; int start_w = down_pw - (kernel_size - 1) / 2; int end_w = down_pw + (kernel_size - 1) / 2 + 1; int start_h = down_ph - (kernel_size - 1) / 2; int end_h = down_ph + (kernel_size - 1) / 2 + 1; scalar_t output_val = 0; for (int iy = start_h; iy < end_h; iy++) { for (int ix = start_w; ix < end_w; ix++) { if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { continue; } int mask_iy = iy - down_ph + (kernel_size - 1) / 2; int mask_ix = ix - down_pw + (kernel_size - 1) / 2; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, c, iy, ix, channels, down_height, down_width); int mask_index = Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); output_val += bottom_data[feat_index] * bottom_masks[mask_index]; } } top_data[index] = output_val; } } template __global__ void 
carafe_naive_backward_musa_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data, const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff, const int kernel_size, const int group_size, const int scale_factor, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the bottom_data int pw = index % width; int ph = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; int mask_channels = kernel_size * kernel_size * group_size; int mask_group = c / (channels / group_size); int down_pw = pw / scale_factor; int down_ph = ph / scale_factor; int down_width = width / scale_factor; int down_height = height / scale_factor; int start_w = down_pw - (kernel_size - 1) / 2; int end_w = down_pw + (kernel_size - 1) / 2 + 1; int start_h = down_ph - (kernel_size - 1) / 2; int end_h = down_ph + (kernel_size - 1) / 2 + 1; for (int iy = start_h; iy < end_h; iy++) { for (int ix = start_w; ix < end_w; ix++) { if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) { continue; } int mask_iy = iy - down_ph + (kernel_size - 1) / 2; int mask_ix = ix - down_pw + (kernel_size - 1) / 2; int mask_c = (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix; int feat_index = Loc2Index(n, c, iy, ix, channels, down_height, down_width); int mask_index = Loc2Index(n, mask_c, ph, pw, mask_channels, height, width); atomicAdd(bottom_diff + feat_index, bottom_masks[mask_index] * top_diff[index]); atomicAdd(mask_diff + mask_index, bottom_data[feat_index] * top_diff[index]); } } } } #endif // CARAFE_NAIVE_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/chamfer_distance_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved. 
// Modified from // https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu #ifndef CHAMFER_DISTANCE_MUSA_KERNEL_MUH #define CHAMFER_DISTANCE_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" #define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 #if MUSA_ARCH > 21 template __global__ void chamfer_distance_forward_musa_kernel(int b, int n, const scalar_t* xyz, int m, const scalar_t* xyz2, scalar_t* result, int* result_i) { __shared__ scalar_t buf[MAX_SHARED_SCALAR_T]; for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) { int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2; for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) { buf[j] = xyz2[(i * m + k2) * 2 + j]; } __syncthreads(); for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { scalar_t x1 = xyz[(i * n + j) * 2 + 0]; scalar_t y1 = xyz[(i * n + j) * 2 + 1]; int best_i = 0; scalar_t best = 1e10; int end_ka = end_k & (~3); if (end_ka == THREADS_PER_BLOCK) { for (int k = 0; k < THREADS_PER_BLOCK; k += 4) { #pragma unroll for (int j = 0; j < 4; ++j) { scalar_t x2 = buf[(k + j) * 2] - x1; scalar_t y2 = buf[(k + j) * 2 + 1] - y1; scalar_t d = x2 * x2 + y2 * y2; if (d < best) { best = d; best_i = k + k2 + j; } } } } else { for (int k = 0; k < end_ka; k += 4) { #pragma unroll for (int j = 0; j < 4; ++j) { scalar_t x2 = buf[(k + j) * 2] - x1; scalar_t y2 = buf[(k + j) * 2 + 1] - y1; scalar_t d = x2 * x2 + y2 * y2; if (d < best) { best = d; best_i = k + k2 + j; } } } } for (int k = end_ka; k < end_k; k++) { scalar_t x2 = buf[k * 2 + 0] - x1; scalar_t y2 = buf[k * 2 + 1] - y1; scalar_t d = x2 * x2 + y2 * y2; if (k == 0 || d < best) { best = d; best_i = k + k2; } } if (k2 == 0 || result[(i * n + j)] > best) { result[(i * n + j)] = best; result_i[(i * n + j)] = best_i; } } __syncthreads(); } } } template __global__ void chamfer_distance_backward_musa_kernel( int b, int n, const scalar_t* xyz1, int m, const scalar_t* 
xyz2, const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1, scalar_t* grad_xyz2) { for (int i = blockIdx.x; i < b; i += gridDim.x) { for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { scalar_t x1 = xyz1[(i * n + j) * 2 + 0]; scalar_t y1 = xyz1[(i * n + j) * 2 + 1]; int j2 = idx1[i * n + j]; scalar_t x2 = xyz2[(i * m + j2) * 2 + 0]; scalar_t y2 = xyz2[(i * m + j2) * 2 + 1]; scalar_t g = grad_dist1[i * n + j] * 2; atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2)); atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2)); atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2))); atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2))); } } } #else #warning "chamfer_distance is supported when MUSA_ARCH > 21" #endif //MUSA_ARCH #endif // CHAMFER_DISTANCE_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/common_musa_helper.hpp ================================================ #ifndef COMMON_MUSA_HELPER #define COMMON_MUSA_HELPER #include #define MUSA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) #define MUSA_2D_KERNEL_LOOP(i, n, j, m) \ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) \ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ j += blockDim.y * gridDim.y) #define MUSA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \ for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \ for (size_t j = blockIdx.y; j < (m); j += gridDim.y) #define THREADS_PER_BLOCK 512 inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) { int optimal_block_num = (N + num_threads - 1) / num_threads; int max_block_num = 4096; return min(optimal_block_num, max_block_num); } template __device__ T bilinear_interpolate(const T* input, const int height, const int width, T y, T x, const int index /* index for debug only*/) { // deal with cases that inverse elements 
are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) return 0; if (y <= 0) y = 0; if (x <= 0) x = 0; int y_low = (int)y; int x_low = (int)x; int y_high; int x_high; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. - lx; // do bilinear interpolation T v1 = input[y_low * width + x_low]; T v2 = input[y_low * width + x_high]; T v3 = input[y_high * width + x_low]; T v4 = input[y_high * width + x_high]; T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void bilinear_interpolate_gradient( const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, int& x_low, int& x_high, int& y_low, int& y_high, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (y < -1.0 || y > height || x < -1.0 || x > width) { // empty w1 = w2 = w3 = w4 = 0.; x_low = x_high = y_low = y_high = -1; return; } if (y <= 0) y = 0; if (x <= 0) x = 0; y_low = (int)y; x_low = (int)x; if (y_low >= height - 1) { y_high = y_low = height - 1; y = (T)y_low; } else { y_high = y_low + 1; } if (x_low >= width - 1) { x_high = x_low = width - 1; x = (T)x_low; } else { x_high = x_low + 1; } T ly = y - y_low; T lx = x - x_low; T hy = 1. - ly, hx = 1. 
- lx; // reference in forward // T v1 = input[y_low * width + x_low]; // T v2 = input[y_low * width + x_high]; // T v3 = input[y_high * width + x_low]; // T v4 = input[y_high * width + x_high]; // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; return; } #endif // COMMON_MUSA_HELPER ================================================ FILE: mmcv/ops/csrc/common/musa/convex_iou_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef CONVEX_IOU_MUSA_KERNEL_MUH #define CONVEX_IOU_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" #define MAXN 100 #define NMAX 512 __device__ const double EPS = 1E-8; __device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); } struct Point { double x, y; __device__ Point() {} __device__ Point(double x, double y) : x(x), y(y) {} }; __device__ inline bool point_same(Point& a, Point& b) { return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0; } __device__ inline void swap1(Point* a, Point* b) { Point temp; temp.x = a->x; temp.y = a->y; a->x = b->x; a->y = b->y; b->x = temp.x; b->y = temp.y; } __device__ inline void reverse1(Point* a, const int n) { for (int i = 0; i < (n - 1) / 2.0; i++) { Point* j = &(a[i]); Point* k = &(a[n - 1 - i]); swap1(j, k); } } __device__ inline double cross(Point o, Point a, Point b) { return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); } __device__ inline double dis(Point a, Point b) { return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); } __device__ inline double area(Point* ps, int n) { ps[n] = ps[0]; double res = 0; for (int i = 0; i < n; i++) { res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; } return res / 2.0; } __device__ inline double polygon_area_grad(Point* ps, int n, int* polygon_to_pred_index, int n_pred, double* grad_C) { ps[n] = ps[0]; double partion_grad[4 * 30 + 2]; double res = 0; for (int i = 0; i < n; i++) { res += ps[i].x * ps[i + 1].y - ps[i].y * 
ps[i + 1].x; partion_grad[i * 4 + 2] = ps[i + 1].y; partion_grad[i * 4 + 3] = -ps[i + 1].x; if (i != n - 1) { partion_grad[i * 4 + 4] = -ps[i].y; partion_grad[i * 4 + 5] = ps[i].x; } else { partion_grad[0] = -ps[i].y; partion_grad[1] = ps[i].x; } } for (int i = 0; i < n; i++) { for (int j = 0; j < n_pred; j++) { if (i == polygon_to_pred_index[j]) { grad_C[2 * polygon_to_pred_index[j + n_pred]] = (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2; break; } } for (int j = 0; j < n_pred; j++) { if (i == polygon_to_pred_index[j]) { grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] = (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2; break; } } } return res / 2.0; } __device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p, double* cut_grad, int m, int n, int i) { double s1, s2; double s2_s1_2; double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd; double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd; s1 = cross(a, b, c); s2 = cross(a, b, d); ds1_dxc = -(b.y - a.y); ds1_dyc = b.x - a.x; ds2_dxd = ds1_dxc; ds2_dyd = ds1_dyc; s2_s1_2 = (s2 - s1) * (s2 - s1); if (sig(s1) == 0 && sig(s2) == 0) return 2; if (sig(s2 - s1) == 0) return 0; dxp_dxc = ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) / (s2_s1_2); dxp_dyc = ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) / (s2_s1_2); dxp_dxd = ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) / (s2_s1_2); dxp_dyd = ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) / (s2_s1_2); dyp_dxc = ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) / (s2_s1_2); dyp_dyc = ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) / (s2_s1_2); dyp_dxd = ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) / (s2_s1_2); dyp_dyd = ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) / (s2_s1_2); p.x = (c.x * s2 - d.x * s1) / (s2 - s1); p.y = (c.y * s2 - 
d.y * s1) / (s2 - s1); if (i == n - 1) { cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd; cut_grad[4 * n * m + 1] = dyp_dxd; cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd; cut_grad[4 * n * m + 3] = dyp_dyd; } else { cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd; cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd; cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd; cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd; } return 1; } __device__ inline void polygon_cut(Point* p, int& n, Point a, Point b, double* cut_grad) { Point pp[MAXN]; double ccur_grad[MAXN] = {}; int m = 0; p[n] = p[0]; int k = n; for (int i = 0; i < n; i++) { if (sig(cross(a, b, p[i])) > 0) { pp[m] = p[i]; ccur_grad[4 * n * m + 4 * i] = 1.0; ccur_grad[4 * n * m + 4 * i + 3] = 1.0; m++; } if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i); m++; } } n = 0; for (int i = 0; i < m; i++) { if (!i || !(point_same(pp[i], pp[i - 1]))) { p[n] = pp[i]; for (int j = 0; j < 4 * k; j++) { cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j]; } n++; } } while (n > 1 && point_same(p[n - 1], p[0])) n--; } __device__ inline double intersectArea(Point a, Point b, Point c, Point d, double* grad_AB, int order, int convex_n) { Point o(0, 0); int res_flag = 0; int s1 = sig(cross(o, a, b)); int s2 = sig(cross(o, c, d)); if (s1 == 0 || s2 == 0) return 0.0; if (s1 == -1) { Point* i = &a; Point* j = &b; swap1(i, j); res_flag = 1; } if (s2 == -1) { Point* i = &c; Point* j = &d; swap1(i, j); } Point p[10] = {o, a, b}; int n = 3, n0 = 3, n1, n2, n3; double 
cut_grad1[MAXN] = {}; double cut_grad2[MAXN] = {}; double cut_grad3[MAXN] = {}; double p1_p_grad[10][10] = {}; double p2_p1_grad[10][10] = {}; double p3_p2_grad[10][10] = {}; double p3_p1_grad[10][10] = {}; double p3_p_grad[10][10] = {}; // 1 polygon_cut(p, n, o, c, cut_grad1); n1 = n; for (int i = 0; i < n; i++) { for (int j = 0; j < 4 * n0; j++) { if (!(j % 2)) { p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j]; } else { p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j]; } } } // 2 polygon_cut(p, n, c, d, cut_grad2); n2 = n; for (int i = 0; i < n; i++) { for (int j = 0; j < 4 * n1; j++) { if (!(j % 2)) { p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j]; } else { p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j]; } } } // 3 polygon_cut(p, n, d, o, cut_grad3); n3 = n; for (int i = 0; i < n; i++) { for (int j = 0; j < 4 * n2; j++) { if (!(j % 2)) { p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j]; } else { p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j]; } } } // mul // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1) for (int i = 0; i < 2 * n3; i++) { for (int j = 0; j < 2 * n1; j++) { double sum = 0.0; for (int m = 0; m < 2 * n2; m++) { sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j]; } p3_p1_grad[i][j] = sum; } } // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0) for (int i = 0; i < 2 * n3; i++) { for (int j = 0; j < 2 * n0; j++) { double sum = 0.0; for (int m = 0; m < 2 * n1; m++) { sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j]; } p3_p_grad[i][j] = sum; } } // calculate S_grad int polygon_index_box_index[20]; double grad_polygon[20]; double S_grad[6]; for (int i = 0; i < n3; i++) { polygon_index_box_index[i] = i; polygon_index_box_index[i + n3] = i; } double res = polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon); if (s1 * s2 == -1) { for (int j = 0; j < 2 * 3; j++) { double sum = 0.0; for (int m = 0; m < 2 * n3; m++) { sum = sum - grad_polygon[m] * p3_p_grad[m][j]; } S_grad[j] = sum; } if (order != convex_n 
- 1) { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[2 * order + 2] += S_grad[2]; grad_AB[2 * order + 3] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[2 * order + 2] += S_grad[4]; grad_AB[2 * order + 3] += S_grad[5]; } } else { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[0] += S_grad[2]; grad_AB[1] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[0] += S_grad[4]; grad_AB[1] += S_grad[5]; } } res = -res; } else { for (int j = 0; j < 2 * 3; j++) { double sum = 0.0; for (int m = 0; m < 2 * n3; m++) { sum = sum + grad_polygon[m] * p3_p_grad[m][j]; } S_grad[j] = sum; } if (order != convex_n - 1) { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[2 * order + 2] += S_grad[2]; grad_AB[2 * order + 3] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[2 * order + 2] += S_grad[4]; grad_AB[2 * order + 3] += S_grad[5]; } } else { if (res_flag) { grad_AB[2 * order] += S_grad[4]; grad_AB[2 * order + 1] += S_grad[5]; grad_AB[0] += S_grad[2]; grad_AB[1] += S_grad[3]; } else { grad_AB[2 * order] += S_grad[2]; grad_AB[2 * order + 1] += S_grad[3]; grad_AB[0] += S_grad[4]; grad_AB[1] += S_grad[5]; } } } return res; } __device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2, double* grad_AB) { if (area(ps1, n1) < 0) reverse1(ps1, n1); if (area(ps2, n2) < 0) reverse1(ps2, n2); ps1[n1] = ps1[0]; ps2[n2] = ps2[0]; double res = 0; for (int i = 0; i < n1; i++) { for (int j = 0; j < n2; j++) { res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1); } } return res; } __device__ inline void Jarvis(Point* in_poly, int& n_poly) { Point p_max, p_k; int max_index, k_index; int Stack[NMAX] = {}, top1, top2; double sign; Point right_point[10], left_point[10]; for 
(int i = 0; i < n_poly; i++) { if (in_poly[i].y < in_poly[0].y || in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { Point* j = &(in_poly[0]); Point* k = &(in_poly[i]); swap1(j, k); } if (i == 0) { p_max = in_poly[0]; max_index = 0; } if (in_poly[i].y > p_max.y || in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { p_max = in_poly[i]; max_index = i; } } if (max_index == 0) { max_index = 1; p_max = in_poly[max_index]; } k_index = 0, Stack[0] = 0, top1 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > dis(in_poly[Stack[top1]], p_k)))) { p_k = in_poly[i]; k_index = i; } } top1++; Stack[top1] = k_index; } for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]]; k_index = 0, Stack[0] = 0, top2 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > dis(in_poly[Stack[top2]], p_k))) { p_k = in_poly[i]; k_index = i; } } top2++; Stack[top2] = k_index; } for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]]; for (int i = 0; i < top1 + top2; i++) { if (i <= top1) { in_poly[i] = right_point[i]; } else { in_poly[i] = left_point[top2 - (i - top1)]; } } n_poly = top1 + top2; } __device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2, int n2, double* grad_C) { Point polygon[MAXN]; int n = n1 + n2, n_poly = 0; for (int i = 0; i < n1; i++) { for (int j = 0; j < n - n1; j++) { if (point_same(ps1[i], ps2[j])) { for (int k = j; k < n - n1 - 1; k++) { ps2[k] = ps2[k + 1]; } n2--; break; } } } n_poly = n1 + n2; for (int i = 0; i < n_poly; i++) { if (i < n1) { polygon[i] = ps1[i]; } else { polygon[i] = ps2[i - n1]; } } Jarvis(polygon, n_poly); int polygon_to_pred_index[18] = {-1, 
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; int n_pred = 0; for (int i = 0; i < n_poly; i++) { for (int j = 0; j < n1; j++) { if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) { polygon_to_pred_index[n_pred] = i; polygon_to_pred_index[n_pred + n1] = j; n_pred += 1; break; } } } if (n_pred == 0) { double polygon_area = fabs(area(polygon, n_poly)); for (int i = 0; i < 18; i++) { grad_C[i] = 0.0; } return polygon_area; } else { double polygon_area = polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C); if (polygon_area < 0) { for (int i = 0; i < 18; i++) { grad_C[i] = -grad_C[i]; } } return fabs(polygon_area); } } // convex_find and get the polygon_index_box_index __device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly, int* points_to_convex_ind) { int n_input = n_poly; Point input_poly[20]; for (int i = 0; i < n_input; i++) { input_poly[i].x = in_poly[i].x; input_poly[i].y = in_poly[i].y; } Point p_max, p_k; int max_index, k_index; int Stack[20], top1, top2; double sign; Point right_point[10], left_point[10]; for (int i = 0; i < n_poly; i++) { if (in_poly[i].y < in_poly[0].y || in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { Point* j = &(in_poly[0]); Point* k = &(in_poly[i]); swap1(j, k); } if (i == 0) { p_max = in_poly[0]; max_index = 0; } if (in_poly[i].y > p_max.y || in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { p_max = in_poly[i]; max_index = i; } } if (max_index == 0) { max_index = 1; p_max = in_poly[max_index]; } k_index = 0, Stack[0] = 0, top1 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > dis(in_poly[Stack[top1]], p_k)))) { p_k = in_poly[i]; k_index = i; } } top1++; Stack[top1] = k_index; } for (int i = 0; i <= top1; i++) { right_point[i] = in_poly[Stack[i]]; } k_index = 0, Stack[0] = 0, top2 = 
0;
  // NOTE(review): the lines down to the first closing brace are the tail of
  // Jarvis_and_index, whose beginning lies outside this chunk; reproduced
  // unchanged.
  while (k_index != max_index) {
    p_k = p_max;
    k_index = max_index;
    for (int i = 1; i < n_poly; i++) {
      sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
      if ((sign < 0) ||
          (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
                          dis(in_poly[Stack[top2]], p_k))) {
        p_k = in_poly[i];
        k_index = i;
      }
    }
    top2++;
    Stack[top2] = k_index;
  }
  for (int i = top2 - 1; i >= 0; i--) {
    left_point[i] = in_poly[Stack[i]];
  }
  for (int i = 0; i < top1 + top2; i++) {
    if (i <= top1) {
      in_poly[i] = right_point[i];
    } else {
      in_poly[i] = left_point[top2 - (i - top1)];
    }
  }
  n_poly = top1 + top2;
  // Map every hull vertex back to the index of the matching input point.
  for (int i = 0; i < n_poly; i++) {
    for (int j = 0; j < n_input; j++) {
      if (point_same(in_poly[i], input_poly[j])) {
        points_to_convex_ind[i] = j;
        break;
      }
    }
  }
}

// GIoU-style score between a 9-point set and a 4-point polygon, with
// gradients.
//   p          : 18 values read as 9 (x, y) pairs.
//   q          : 8 values read as 4 (x, y) pairs.
//   point_grad : receives 18 values (one (x, y) gradient per input point);
//                points that are not mapped by points_to_convex_ind get 0.
//   idx        : only referenced by the commented-out printf below.
// Returns iou - (polygon_area - union_area) / polygon_area as float.
// NOTE(review): polygon_area comes from intersectAreaPoly, which is defined
// outside this chunk; presumably the area of the enclosing polygon - confirm.
template __device__ inline float devrIoU(T const* const p, T const* const q,
                                         T* point_grad, const int idx) {
  Point ps1[MAXN], ps2[MAXN];
  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = (double)p[i * 2];
    convex[i].y = (double)p[i * 2 + 1];
  }
  int n_convex = 9;
  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  // n_convex is passed on to Jarvis_and_index and is used below as the number
  // of hull vertices.
  Jarvis_and_index(convex, n_convex, points_to_convex_ind);
  int n1 = n_convex;
  int n2 = 4;
  for (int i = 0; i < n1; i++) {
    ps1[i].x = (double)convex[i].x;
    ps1[i].y = (double)convex[i].y;
  }
  for (int i = 0; i < n2; i++) {
    ps2[i].x = (double)q[i * 2];
    ps2[i].y = (double)q[i * 2 + 1];
  }
  int polygon_index_box_index[18];
  for (int i = 0; i < n1; i++) {
    polygon_index_box_index[i] = i;
    polygon_index_box_index[i + n1] = i;
  }
  double grad_A[18] = {};
  double grad_AB[18] = {};
  double grad_C[18] = {};
  double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
  double S_pred =
      polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
  // The area is used through fabs() below, so flip the gradient sign when the
  // signed area is negative.
  if (S_pred < 0) {
    for (int i = 0; i < n_convex * 2; i++) {
      grad_A[i] = -grad_A[i];
    }
  }
  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
  double iou = inter_area / union_area;
  double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
  // printf("%d:live\n", idx);
  double rot_giou = iou - (polygon_area - union_area) / polygon_area;
  float grad_point_temp[18] = {};
  for (int i = 0; i < n_convex; i++) {
    // Scatter the gradient of hull vertex i to the slot of its source point.
    // NOTE(review): an entry left at -1 would index before the array; this
    // relies on Jarvis_and_index filling every entry below n_convex - confirm.
    int grad_point = points_to_convex_ind[i];
    grad_point_temp[2 * grad_point] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i] -
                iou / union_area * grad_A[2 * i] -
                1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
                (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
    grad_point_temp[2 * grad_point + 1] =
        (float)((union_area + inter_area) / (union_area * union_area) *
                    grad_AB[2 * i + 1] -
                iou / union_area * grad_A[2 * i + 1] -
                1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
                (union_area) / polygon_area / polygon_area *
                    grad_C[2 * i + 1]);
  }
  for (int i = 0; i < 9; i++) {
    point_grad[2 * i] = grad_point_temp[2 * i];
    point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
  }
  return (float)rot_giou;
}

// For each index below ex_n_boxes: reads 18 values from ex_boxes and 8 values
// from gt_boxes (same index), and writes 19 values to point_grad - the 18
// point gradients followed by the GIoU score at offset 18.
// gt_n_boxes is not used in this kernel.
template __global__ void convex_giou_musa_kernel(const int ex_n_boxes,
                                                 const int gt_n_boxes,
                                                 const T* ex_boxes,
                                                 const T* gt_boxes,
                                                 T* point_grad) {
  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* cur_box = ex_boxes + index * 18;
    const T* cur_gt_box = gt_boxes + index * 8;
    T* cur_grad = point_grad + index * 19;
    T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
    cur_grad[18] = giou;
  }
}

// Intersects line (a, b) with line (c, d).
// Returns 2 when both c and d have a zero cross product against (a, b),
// 0 when the two cross products are equal (no single crossing point),
// and 1 after storing the crossing point in p.
__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
  double s1, s2;
  s1 = cross(a, b, c);
  s2 = cross(a, b, d);
  if (sig(s1) == 0 && sig(s2) == 0) return 2;
  if (sig(s2 - s1) == 0) return 0;
  p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
  p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
  return 1;
}

// Clips polygon p (n vertices, updated in place) against the line (a, b),
// keeping the vertices whose cross product against (a, b) is positive plus
// the crossing points. p must have room for n + 1 points because p[n] is
// written as a closing copy of p[0].
__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
  Point pp[MAXN];
  int m = 0;
  p[n] = p[0];
  for (int i = 0; i < n; i++) {
    if (sig(cross(a, b, p[i])) > 0) {
      pp[m] = p[i];
      m++;
    }
    // The edge p[i] -> p[i + 1] changes side: add the crossing point.
    if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
      lineCross(a, b, p[i], p[i + 1], pp[m]);
      m++;
    }
  }
  // Copy back, dropping consecutive duplicates.
  n = 0;
  for (int i = 0; i < m; i++) {
    if (!i || !(point_same(pp[i], pp[i - 1]))) {
      p[n] = pp[i];
      n++;
    }
  }
  // Drop trailing points that coincide with the first one.
  while (n > 1 && point_same(p[n - 1], p[0])) n--;
}

// Signed overlap area of triangles (o, a, b) and (o, c, d), o = (0, 0).
// Returns 0 when either triangle is degenerate; the result is negated when
// the two triangles have opposite orientation.
// NOTE(review): swap1 is defined outside this chunk and receives pointers to
// the local copies; presumably it exchanges the pointed-to points - confirm.
__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
  Point o(0, 0);
  int s1 = sig(cross(o, a, b));
  int s2 = sig(cross(o, c, d));
  if (s1 == 0 || s2 == 0) return 0.0;
  if (s1 == -1) {
    Point* i = &a;
    Point* j = &b;
    swap1(i, j);
  }
  if (s2 == -1) {
    Point* i = &c;
    Point* j = &d;
    swap1(i, j);
  }
  Point p[10] = {o, a, b};
  int n = 3;
  polygon_cut(p, n, o, c);
  polygon_cut(p, n, c, d);
  polygon_cut(p, n, d, o);
  double res = area(p, n);
  if (s1 * s2 == -1) res = -res;
  return res;
}

// Intersection area of polygons ps1 (n1 vertices) and ps2 (n2 vertices),
// accumulated over all edge pairs with intersectArea.
// Both arrays are modified: they may be reversed, and ps1[n1] / ps2[n2] are
// written, so each needs room for one extra point.
__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
                                        int n2) {
  if (area(ps1, n1) < 0) reverse1(ps1, n1);
  if (area(ps2, n2) < 0) reverse1(ps2, n2);
  ps1[n1] = ps1[0];
  ps2[n2] = ps2[0];
  double res = 0;
  for (int i = 0; i < n1; i++) {
    for (int j = 0; j < n2; j++) {
      res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
    }
  }
  return res;
}

// IoU between the hull of a 9-point set p (18 values) and a 4-point polygon
// q (8 values). Returns inter_area / union_area as float; no gradients.
template __device__ inline float devrIoU(T const* const p, T const* const q) {
  Point ps1[MAXN], ps2[MAXN];
  Point convex[MAXN];
  for (int i = 0; i < 9; i++) {
    convex[i].x = (double)p[i * 2];
    convex[i].y = (double)p[i * 2 + 1];
  }
  int n_convex = 9;
  int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
  Jarvis_and_index(convex, n_convex, points_to_convex_ind);
  int n1 = n_convex;
  for (int i = 0; i < n1; i++) {
    ps1[i].x = (double)convex[i].x;
    ps1[i].y = (double)convex[i].y;
  }
  int n2 = 4;
  for (int i = 0; i < n2; i++) {
    ps2[i].x = (double)q[i * 2];
    ps2[i].y = (double)q[i * 2 + 1];
  }
  double inter_area = intersectAreaO(ps1, n1, ps2, n2);
  double S_pred = area(ps1, n1);
  double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
  double iou = inter_area / union_area;
  return (float)iou;
}

// For each index below ex_n_boxes, computes the IoU of that 18-value point
// set against every one of the gt_n_boxes 8-value polygons and stores it at
// iou[index * gt_n_boxes + i].
template __global__ void convex_iou_musa_kernel(const int ex_n_boxes,
                                                const int gt_n_boxes,
                                                const T* ex_boxes,
                                                const T* gt_boxes, T* iou) {
  MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) {
    const T* cur_box = ex_boxes + index * 18;
    for (int i = 0; i < gt_n_boxes; i++) {
      iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
    }
  }
}
#endif  // CONVEX_IOU_MUSA_KERNEL_MUH

================================================
FILE: mmcv/ops/csrc/common/musa/correlation_musa.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
// Original licence: Under MIT License

#ifndef CORRELATION_MUSA
#define CORRELATION_MUSA

#include "pytorch_musa_helper.hpp"

#include
#include
// Using is recommended in the official documentation in
// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
// However, we use for compatibility with MUSA 9.0
// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
#include

#include
#include

using namespace torch;

#define TensorAcc4R PackedTensorAccessor32
#define TensorAcc5R PackedTensorAccessor32
// True when x lies in [0, H) and y lies in [0, W).
#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)

#define WARP_SIZE 32
#define FULL_MASK 0xffffffff

// Correlation forward pass.
// blockIdx.x selects the sample n; the y and z block/thread indices select
// the output position (h, w). The threadIdx.x lanes split the channel loop in
// steps of WARP_SIZE and their partial sums are combined with a shuffle-down
// reduction; lane 0 writes output[n][ph][pw][h][w].
// Inputs are indexed as [n][row][col][channel].
template __global__ void correlation_forward_musa_kernel(
    const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
    int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
    int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW,
    int oH, int oW) {
  const int iH = rInput1.size(1);
  const int iW = rInput1.size(2);
  const int C = rInput1.size(3);

  const int n = blockIdx.x;
  const int h = blockIdx.y * blockDim.y + threadIdx.y;
  const int w = blockIdx.z * blockDim.z + threadIdx.z;

  if (h >= oH || w >= oW) return;

  const int thread = threadIdx.x;

  const int start_i = -padH + h * dH;
  const int start_j = -padW + w * dW;

  const int patchRadH = dilation_patchH * (patchH - 1) / 2;
  const int patchRadW = dilation_patchW * (patchW - 1) / 2;

  for (int ph = 0; ph < patchH; ++ph) {
    int ph_dilated = ph * dilation_patchH - patchRadH;
    for (int pw = 0; pw < patchW; ++pw) {
      int pw_dilated = pw * dilation_patchW - patchRadW;
      scalar_t prod_sum = 0.0f;
      for (int i = 0; i < kH; ++i) {
        int i1 = start_i + i * dilationH;
        int i2 = i1 + ph_dilated;
        // Both row indices are checked against the same bound iH.
        if (WITHIN_BOUNDS(i1, i2, iH, iH)) {
          for (int j = 0; j < kW; ++j) {
            int j1 = start_j + j * dilationW;
            int j2 = j1 + pw_dilated;
            // Both column indices are checked against the same bound iW.
            if (WITHIN_BOUNDS(j1, j2, iW, iW)) {
              for (int c = thread; c < C; c += WARP_SIZE) {
                scalar_t v1 = rInput1[n][i1][j1][c];
                scalar_t v2 = rInput2[n][i2][j2][c];
                prod_sum += v1 * v2;
              }
            }
          }
        }
      }
      // accumulate
      // The partial sum is converted to float for the shuffle, whatever
      // scalar_t is.
      for (int offset = 16; offset > 0; offset /= 2)
#ifdef MMCV_WITH_HIP
        prod_sum += __shfl_down(float(prod_sum), offset);
#else
        prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
#endif
      if (thread == 0) {
        output[n][ph][pw][h][w] = prod_sum;
      }
    }
  }
}

// Correlation backward pass with respect to the first input.
// One block per (n, h, w) taken from blockIdx.x/y/z. Phase 1 fills the
// dynamic shared buffer grad_cache with one summed grad_output value per
// patch offset; after __syncthreads(), phase 2 loops over channels and writes
// grad_input1[n][c][h][w].
// grad_cache entries are written only for in-bounds patch positions and are
// read under the same bounds test.
// NOTE(review): the launch must supply at least patchH * patchW scalar_t
// elements of dynamic shared memory - confirm at the call site.
template __global__ void correlation_backward_musa_kernel_input1(
    const TensorAcc5R grad_output, const TensorAcc4R input2,
    TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
    const int patchW, const int padH, const int padW, const int dilationH,
    const int dilationW, const int dilation_patchH, const int dilation_patchW,
    const int dH, const int dW) {
  const int iH = input2.size(1);
  const int iW = input2.size(2);
  const int C = input2.size(3);

  const int H = grad_output.size(3);
  const int W = grad_output.size(4);

  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;

  const int h_2 = h + padH;
  const int w_2 = w + padW;
  const int min_h = h_2 - kH * dilationH;
  const int min_w = w_2 - kW * dilationW;

  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
  scalar_t *grad_cache = reinterpret_cast(grad_cache_char);
  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
    const int ph = i / patchW;
    const int pw = i % patchW;
    int i1 = h + dilation_patchH * (ph - patchRadH);
    int j1 = w + dilation_patchW * (pw - patchRadW);

    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
      scalar_t grad_val = 0.0f;
      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
        int i2 = (h_3) / dH;
        // Skip positions that are not a multiple of the stride.
        if (i2 * dH != h_3) continue;
        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
          int j2 = (w_3) / dW;
          if (j2 * dW != w_3) continue;
          if (WITHIN_BOUNDS(i2, j2, H, W)) {
            grad_val += grad_output[n][ph][pw][i2][j2];
          }
        }
      }
      grad_cache[i] = grad_val;
    }
  }
  __syncthreads();

  for (int c = threadIdx.x; c < C; c += blockDim.x) {
    scalar_t grad_input_val = 0.0f;
    for (int ph = 0; ph < patchH; ++ph) {
      int i1 = h + dilation_patchH * (ph - patchRadH);
      for (int pw = 0; pw < patchW; ++pw) {
        int j1 = w + dilation_patchW * (pw - patchRadW);
        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
          grad_input_val +=
              input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
        }
      }
    }
    grad_input1[n][c][h][w] = grad_input_val;
  }
}

// Correlation backward pass with respect to the second input.
// Same two-phase structure as the input1 kernel, but the patch offset is
// subtracted from (h, w) and the stride window is anchored at the shifted
// position (i1 + padH, j1 + padW).
template __global__ void correlation_backward_musa_kernel_input2(
    const TensorAcc5R grad_output, const TensorAcc4R input1,
    TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
    int padW, int dilationH, int dilationW, int dilation_patchH,
    int dilation_patchW, int dH, int dW) {
  const int iH = input1.size(1);
  const int iW = input1.size(2);
  const int C = input1.size(3);

  const int patchRadH = (patchH - 1) / 2;
  const int patchRadW = (patchW - 1) / 2;

  const int H = grad_output.size(3);
  const int W = grad_output.size(4);

  const int dilatedKH = kH * dilationH;
  const int dilatedKW = kW * dilationW;

  const int n = blockIdx.x;
  const int h = blockIdx.y;
  const int w = blockIdx.z;

  extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
  scalar_t *grad_cache = reinterpret_cast(grad_cache_char);
  for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
    const int ph = i / patchW;
    const int pw = i % patchW;
    int i1 = h - dilation_patchH * (ph - patchRadH);
    int j1 = w - dilation_patchW * (pw - patchRadW);

    if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
      scalar_t grad_val = 0.0f;

      const int h_2 = i1 + padH;
      const int w_2 = j1 + padW;
      const int min_h = h_2 - dilatedKH;
      const int min_w = w_2 - dilatedKW;

      for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
        int i2 = (h_3) / dH;
        // Skip positions that are not a multiple of the stride.
        if (i2 * dH != h_3) continue;
        for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
          int j2 = (w_3) / dW;
          if (j2 * dW != w_3) continue;
          if (WITHIN_BOUNDS(i2, j2, H, W)) {
            grad_val += grad_output[n][ph][pw][i2][j2];
          }
        }
      }
      grad_cache[i] = grad_val;
    }
  }
  __syncthreads();

  for (int c = threadIdx.x; c < C; c += blockDim.x) {
    scalar_t grad_input_val = 0.0f;
    for (int ph = 0; ph < patchH; ++ph) {
      int i1 = h - dilation_patchH * (ph - patchRadH);
      for (int pw = 0; pw < patchW; ++pw) {
        int j1 = w - dilation_patchW * (pw - patchRadW);
        if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
          grad_input_val +=
              input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
        }
      }
    }
    grad_input2[n][c][h][w] = grad_input_val;
  }
}
#endif

================================================
FILE: mmcv/ops/csrc/common/musa/deform_conv_musa_kernel.muh
================================================
/*!
 ******************* BEGIN Caffe Copyright Notice and Disclaimer
 *****************
 *
 * COPYRIGHT
 *
 * All contributions by the University of California:
 * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
 * All rights reserved.
 *
 * All other contributions:
 * Copyright (c) 2014-2017, the respective contributors
 * All rights reserved.
 *
 * Caffe uses a shared copyright model: each contributor holds copyright over
 * their contributions to Caffe. The project versioning records all such
 * contribution and copyright details. If a contributor wants to further mark
 * their specific copyright on a particular contribution, they should indicate
 * their copyright solely in the commit message of the change when it is
 * committed.
 *
 * LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ********************* * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.muh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. 
* \ref: https://arxiv.org/abs/1703.06211
 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
 */

// modified from
// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu

#ifndef DEFORM_CONV_MUSA_KERNEL_MUH
#define DEFORM_CONV_MUSA_KERNEL_MUH

#include
#include "pytorch_musa_helper.hpp"

// Bilinearly samples `input` (row stride data_width) at the fractional
// position (h, w). Returns 0 when the position is at or beyond one pixel
// outside the height x width image; corner samples that fall outside the
// image contribute 0.
template __device__ T deformable_im2col_bilinear(const T *input,
                                                 const int data_width,
                                                 const int height,
                                                 const int width, T h, T w) {
  if (h <= -1 || height <= h || w <= -1 || width <= w) {
    return 0;
  }

  int h_low = floorf(h);
  int w_low = floorf(w);
  int h_high = h_low + 1;
  int w_high = w_low + 1;

  // Fractional distances to the low corner and their complements.
  T lh = h - h_low;
  T lw = w - w_low;
  T hh = 1 - lh, hw = 1 - lw;

  T v1 = 0;
  if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
  T v2 = 0;
  if (h_low >= 0 && w_high <= width - 1)
    v2 = input[h_low * data_width + w_high];
  T v3 = 0;
  if (h_high <= height - 1 && w_low >= 0)
    v3 = input[h_high * data_width + w_low];
  T v4 = 0;
  if (h_high <= height - 1 && w_high <= width - 1)
    v4 = input[h_high * data_width + w_high];

  T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
  return val;
}

// Bilinear weight with which the integer pixel (h, w) contributes to a sample
// taken at the fractional position (argmax_h, argmax_w). Returns 0 when the
// sample position is out of range or (h, w) is not one of its four
// neighbouring pixels.
template __device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
                                          const int w, const int height,
                                          const int width) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    // empty
    return 0;
  }

  int argmax_h_low = floorf(argmax_h);
  int argmax_w_low = floorf(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;
  if (h == argmax_h_low && w == argmax_w_low)
    weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
  if (h == argmax_h_low && w == argmax_w_high)
    weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
  if (h == argmax_h_high && w == argmax_w_low)
    weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
  if (h == argmax_h_high && w == argmax_w_high)
    weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
  return weight;
}

// Derivative of the bilinear sample of im_data at (argmax_h, argmax_w) with
// respect to the sampling coordinate: bp_dir == 0 differentiates along h,
// bp_dir == 1 along w. Any other bp_dir, or an out-of-range position,
// yields 0.
template __device__ T get_coordinate_weight(T argmax_h, T argmax_w,
                                            const int height, const int width,
                                            const T *im_data,
                                            const int data_width,
                                            const int bp_dir) {
  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
      argmax_w >= width) {
    // empty
    return 0;
  }

  int argmax_h_low = floorf(argmax_h);
  int argmax_w_low = floorf(argmax_w);
  int argmax_h_high = argmax_h_low + 1;
  int argmax_w_high = argmax_w_low + 1;

  T weight = 0;

  if (bp_dir == 0) {
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_w_low + 1 - argmax_w) *
                im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += -1 * (argmax_w - argmax_w_low) *
                im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += (argmax_w_low + 1 - argmax_w) *
                im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_w - argmax_w_low) *
                im_data[argmax_h_high * data_width + argmax_w_high];
  } else if (bp_dir == 1) {
    if (argmax_h_low >= 0 && argmax_w_low >= 0)
      weight += -1 * (argmax_h_low + 1 - argmax_h) *
                im_data[argmax_h_low * data_width + argmax_w_low];
    if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
      weight += (argmax_h_low + 1 - argmax_h) *
                im_data[argmax_h_low * data_width + argmax_w_high];
    if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
      weight += -1 * (argmax_h - argmax_h_low) *
                im_data[argmax_h_high * data_width + argmax_w_low];
    if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
      weight += (argmax_h - argmax_h_low) *
                im_data[argmax_h_high * data_width + argmax_w_high];
  }

  return weight;
}

// Deformable im2col: for every (channel, batch, h_col, w_col) index below n,
// samples the input at the kernel_h * kernel_w offset-shifted positions and
// writes the values into data_col, stepping batch_size * height_col *
// width_col elements between consecutive kernel taps.
template __global__ void deformable_im2col_gpu_kernel(
    const int n, const T *data_im, const T *data_offset, const int height,
    const int width, const int kernel_h, const int kernel_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int num_channels, const int deformable_group, const int height_col,
    const int width_col, T *data_col) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    // index index of output matrix
    const int w_col = index % width_col;
    const int h_col = (index / width_col) % height_col;
    const int b_col = (index / width_col / height_col) % batch_size;
    const int c_im = (index / width_col / height_col) / batch_size;
    const int c_col = c_im * kernel_h * kernel_w;

    // compute deformable group index
    const int deformable_group_index = c_im / channel_per_deformable_group;

    const int h_in = h_col * stride_h - pad_h;
    const int w_in = w_col * stride_w - pad_w;
    T *data_col_ptr =
        data_col +
        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
    const T *data_im_ptr =
        data_im + (b_col * num_channels + c_im) * height * width;
    const T *data_offset_ptr =
        data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;

    for (int i = 0; i < kernel_h; ++i) {
      for (int j = 0; j < kernel_w; ++j) {
        // Offsets are stored as interleaved (h, w) planes per kernel tap.
        const int data_offset_h_ptr =
            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
        const int data_offset_w_ptr =
            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
            w_col;
        const T offset_h = data_offset_ptr[data_offset_h_ptr];
        const T offset_w = data_offset_ptr[data_offset_w_ptr];
        T val = static_cast(0);
        const T h_im = h_in + i * dilation_h + offset_h;
        const T w_im = w_in + j * dilation_w + offset_w;
        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
          val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
                                           h_im, w_im);
        *data_col_ptr = val;
        data_col_ptr += batch_size * height_col * width_col;
      }
    }
  }
}

// Deformable col2im: scatters each column gradient data_col[index] to the
// input pixels around its sampling position, weighted by
// get_gradient_weight. Accumulation into grad_im uses atomicAdd because
// different indices can hit the same pixel.
template __global__ void deformable_col2im_gpu_kernel(
    const int n, const T *data_col, const T *data_offset, const int channels,
    const int height, const int width, const int kernel_h, const int kernel_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int deformable_group, const int height_col, const int width_col,
    T *grad_im) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    const int j = (index / width_col / height_col / batch_size) % kernel_w;
    const int i =
        (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
    const int c =
        index / width_col / height_col / batch_size / kernel_w / kernel_h;
    // compute the start and end of the output

    const int deformable_group_index = c / channel_per_deformable_group;

    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int b = (index / width_col / height_col) % batch_size;
    int w_in = w_out * stride_w - pad_w;
    int h_in = h_out * stride_h - pad_h;

    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;
    const int data_offset_h_ptr =
        ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
    const int data_offset_w_ptr =
        ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
    const T offset_h = data_offset_ptr[data_offset_h_ptr];
    const T offset_w = data_offset_ptr[data_offset_w_ptr];
    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;

    const T cur_top_grad = data_col[index];
    const int cur_h = (int)cur_inv_h_data;
    const int cur_w = (int)cur_inv_w_data;
    // Scan a 5x5 window around the truncated position; only in-image pixels
    // within distance 1 of the sampling position pass the test.
    for (int dy = -2; dy <= 2; dy++) {
      for (int dx = -2; dx <= 2; dx++) {
        if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 &&
            cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
            abs(cur_inv_w_data - (cur_w + dx)) < 1) {
          int cur_bottom_grad_pos =
              ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
          T weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data,
                                         cur_h + dy, cur_w + dx, height, width);
          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
        }
      }
    }
  }
}

// Gradient with respect to the offsets: for each offset element below n,
// sums weight * column gradient over the channels of its deformable group,
// where weight is the coordinate derivative from get_coordinate_weight.
// Each index writes its own grad_offset[index], so no atomics are used.
template __global__ void deformable_col2im_coord_gpu_kernel(
    const int n, const T *data_col, const T *data_im, const T *data_offset,
    const int channels, const int height, const int width, const int kernel_h,
    const int kernel_w, const int pad_h, const int pad_w, const int stride_h,
    const int stride_w, const int dilation_h, const int dilation_w,
    const int channel_per_deformable_group, const int batch_size,
    const int offset_channels, const int deformable_group, const int height_col,
    const int width_col, T *grad_offset) {
  MUSA_1D_KERNEL_LOOP(index, n) {
    T val = 0;
    int w = index % width_col;
    int h = (index / width_col) % height_col;
    int c = (index / width_col / height_col) % offset_channels;
    int b = (index / width_col / height_col) / offset_channels;
    // compute the start and end of the output

    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
    const int col_step = kernel_h * kernel_w;
    int cnt = 0;
    const T *data_col_ptr = data_col + deformable_group_index *
                                           channel_per_deformable_group *
                                           batch_size * width_col * height_col;
    const T *data_im_ptr =
        data_im + (b * deformable_group + deformable_group_index) *
                      channel_per_deformable_group / kernel_h / kernel_w *
                      height * width;
    const T *data_offset_ptr =
        data_offset + (b * deformable_group + deformable_group_index) * 2 *
                          kernel_h * kernel_w * height_col * width_col;

    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;

    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group;
         col_c += col_step) {
      const int col_pos =
          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
      // Even offset channels are the h component, odd ones the w component.
      const int bp_dir = offset_c % 2;

      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
      int i =
          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
      int w_out = col_pos % width_col;
      int h_out = (col_pos / width_col) % height_col;
      int w_in = w_out * stride_w - pad_w;
      int h_in = h_out * stride_h - pad_h;
      const int data_offset_h_ptr =
          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
      const int data_offset_w_ptr =
          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
           w_out);
      const T offset_h = data_offset_ptr[data_offset_h_ptr];
      const T offset_w = data_offset_ptr[data_offset_w_ptr];
      T inv_h = h_in + i * dilation_h + offset_h;
      T inv_w = w_in + j * dilation_w + offset_w;
      // Force an out-of-range position so the weight below becomes 0.
      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
        inv_h = inv_w = -2;
      const T weight = get_coordinate_weight(inv_h, inv_w, height, width,
                                             data_im_ptr + cnt * height * width,
                                             width, bp_dir);
      val += weight * data_col_ptr[col_pos];
      cnt += 1;
    }

    grad_offset[index] = val;
  }
}

#endif  // DEFORM_CONV_MUSA_KERNEL_MUH

================================================
FILE: mmcv/ops/csrc/common/musa/deform_roi_pool_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab. All rights reserved
#ifndef DEFORM_ROI_POOL_MUSA_KERNEL_MUH
#define DEFORM_ROI_POOL_MUSA_KERNEL_MUH

#include "pytorch_musa_helper.hpp"

// Deformable RoI pooling forward pass. Each index below nthreads is one
// pooled output element (n, c, ph, pw); the kernel averages bilinear samples
// taken on a roi_bin_grid_h x roi_bin_grid_w grid inside that bin.
// rois holds 5 values per RoI: batch index followed by four box coordinates
// read in the order (start_w, start_h, end_w, end_h). When `offset` is not
// NULL the RoI start is shifted by gamma * roi size * offset.
template __global__ void deform_roi_pool_forward_musa_kernel(
    const int nthreads, const T* input, const T* rois, const T* offset,
    T* output, const int pooled_height, const int pooled_width,
    const T spatial_scale, const int sampling_ratio, const T gamma,
    const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;

    T bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
    T bin_size_w = static_cast(roi_width) / static_cast(pooled_width);

    const T* offset_input =
        input + (roi_batch_ind * channels + c) * height * width;

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast(ceilf(roi_height / pooled_height));
    int roi_bin_grid_w =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast(ceilf(roi_width / pooled_width));

    // Compute roi offset
    if (offset != NULL) {
      // The w offsets come first, the h offsets one
      // pooled_width * pooled_height plane later.
      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
                              ph * pooled_width + pw;
      T offset_roi_w = gamma * roi_width * offset_cur_w[0];
      T offset_roi_h =
          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
      roi_start_w += offset_roi_w;
      roi_start_h += offset_roi_h;
    }

    // We do average pooling inside a bin
    // Clamped to at least 1 so an empty grid does not divide by zero.
    const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
    T output_val = 0.;
    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T y = roi_start_h + ph * bin_size_h +
                  static_cast(iy + .5f) * bin_size_h /
                      static_cast(roi_bin_grid_h);
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
                    static_cast(ix + .5f) * bin_size_w /
                        static_cast(roi_bin_grid_w);
        T val = bilinear_interpolate(offset_input, height, width, y, x, index);
        output_val += val;
      }
    }
    output[index] = output_val / count;
  }
}

// Deformable RoI pooling backward pass. For each pooled element it spreads
// grad_output[index] / count over the four input pixels of every sample
// (atomicAdd into grad_input) and, when `offset` is not NULL, accumulates the
// gradient of the two offset components into grad_offset.
template __global__ void deform_roi_pool_backward_musa_kernel(
    const int nthreads, const T* grad_output, const T* input, const T* rois,
    const T* offset, T* grad_input, T* grad_offset, const int pooled_height,
    const int pooled_width, const T spatial_scale, const int sampling_ratio,
    const T gamma, const int channels, const int height, const int width) {
  MUSA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, ph, pw) is an element in the pooled output
    int pw = index % pooled_width;
    int ph = (index / pooled_width) % pooled_height;
    int c = (index / pooled_width / pooled_height) % channels;
    int n = index / pooled_width / pooled_height / channels;

    const T* offset_rois = rois + n * 5;
    int roi_batch_ind = offset_rois[0];
    const T* offset_input =
        input + ((roi_batch_ind * channels + c) * height * width);
    T* offset_grad_input =
        grad_input + ((roi_batch_ind * channels + c) * height * width);

    // Do not use rounding; this implementation detail is critical
    T roi_start_w = offset_rois[1] * spatial_scale - 0.5;
    T roi_start_h = offset_rois[2] * spatial_scale - 0.5;
    T roi_end_w = offset_rois[3] * spatial_scale - 0.5;
    T roi_end_h = offset_rois[4] * spatial_scale - 0.5;

    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;

    T bin_size_h = static_cast(roi_height) / static_cast(pooled_height);
    T bin_size_w = static_cast(roi_width) / static_cast(pooled_width);

    // We use roi_bin_grid to sample the grid and mimic integral
    int roi_bin_grid_h =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast(ceilf(roi_height / pooled_height));
    int roi_bin_grid_w =
        (sampling_ratio > 0)
            ? sampling_ratio
            : static_cast(ceilf(roi_width / pooled_width));

    // Compute roi offset
    if (offset != NULL) {
      const T* offset_cur_w = offset + n * pooled_width * pooled_height * 2 +
                              ph * pooled_width + pw;
      T offset_roi_w = gamma * roi_width * offset_cur_w[0];
      T offset_roi_h =
          gamma * roi_height * offset_cur_w[pooled_width * pooled_height];
      roi_start_w += offset_roi_w;
      roi_start_h += offset_roi_h;
    }

    // We do average (integral) pooling inside a bin
    // NOTE(review): unlike the forward kernel, count is not clamped to 1
    // here. With an empty grid the quotient below is not finite, but it is
    // only used inside the sample loops, which do not run in that case.
    const T count = roi_bin_grid_h * roi_bin_grid_w;  // e.g. = 4
    const T grad_output_this_bin = grad_output[index] / count;

    for (int iy = 0; iy < roi_bin_grid_h; iy++) {
      const T y = roi_start_h + ph * bin_size_h +
                  static_cast(iy + .5f) * bin_size_h /
                      static_cast(roi_bin_grid_h);
      for (int ix = 0; ix < roi_bin_grid_w; ix++) {
        const T x = roi_start_w + pw * bin_size_w +
                    static_cast(ix + .5f) * bin_size_w /
                        static_cast(roi_bin_grid_w);

        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
                                      x_low, x_high, y_low, y_high, index);

        // NOTE(review): presumably the helper reports negative indices for
        // samples outside the image - confirm against its definition.
        if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
          atomicAdd(offset_grad_input + y_low * width + x_low,
                    grad_output_this_bin * w1);
          atomicAdd(offset_grad_input + y_low * width + x_high,
                    grad_output_this_bin * w2);
          atomicAdd(offset_grad_input + y_high * width + x_low,
                    grad_output_this_bin * w3);
          atomicAdd(offset_grad_input + y_high * width + x_high,
                    grad_output_this_bin * w4);
          if (offset != NULL) {
            T input_00 = offset_input[y_low * width + x_low];
            T input_10 = offset_input[y_low * width + x_high];
            T input_01 = offset_input[y_high * width + x_low];
            T input_11 = offset_input[y_high * width + x_high];
            // Derivative of the bilinear sample along x (ogx) and y (ogy),
            // scaled by the same gamma * roi size factor as the forward
            // shift.
            T ogx = gamma * roi_width * grad_output_this_bin *
                    (input_11 * (y - y_low) + input_10 * (y_high - y) +
                     input_01 * (y_low - y) + input_00 * (y - y_high));
            T ogy = gamma * roi_height * grad_output_this_bin *
                    (input_11 * (x - x_low) + input_01 * (x_high - x) +
                     input_10 * (x_low - x) + input_00 * (x - x_high));
            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
                          ph * pooled_width + pw,
                      ogx);
            atomicAdd(grad_offset + n * pooled_width * pooled_height * 2 +
                          pooled_width * pooled_height + ph * pooled_width + pw,
                      ogy);
          }
        }
      }
    }
  }
}

#endif  // DEFORM_ROI_POOL_MUSA_KERNEL_MUH

================================================
FILE: mmcv/ops/csrc/common/musa/diff_iou_rotated_musa_kernel.muh
================================================
// Copyright (c) OpenMMLab.
All rights reserved // Adapted from // https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa #include "pytorch_musa_helper.hpp" #define MAX_NUM_VERT_IDX 9 #define INTERSECTION_OFFSET 8 #define EPSILON 1e-8 inline int opt_n_thread(int work_size) { const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); } /* compare normalized vertices (vertices around (0,0)) if vertex1 < vertex2 return true. order: minimum at x-aixs, become larger in anti-clockwise direction */ __device__ bool compare_vertices(float x1, float y1, float x2, float y2) { if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) return false; // if equal, return false if (y1 > 0 && y2 < 0) return true; if (y1 < 0 && y2 > 0) return false; float n1 = x1 * x1 + y1 * y1 + EPSILON; float n2 = x2 * x2 + y2 * y2 + EPSILON; float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; if (y1 > 0 && y2 > 0) { if (diff > EPSILON) return true; else return false; } if (y1 < 0 && y2 < 0) { if (diff < EPSILON) return true; else return false; } return false; } __global__ void diff_iou_rotated_sort_vertices_forward_musa_kernel( int b, int n, int m, const float *__restrict__ vertices, const bool *__restrict__ mask, const int *__restrict__ num_valid, int *__restrict__ idx) { int batch_idx = blockIdx.x; vertices += batch_idx * n * m * 2; mask += batch_idx * n * m; num_valid += batch_idx * n; idx += batch_idx * n * MAX_NUM_VERT_IDX; int index = threadIdx.x; // index of polygon int stride = blockDim.x; for (int i = index; i < n; i += stride) { int pad; // index of arbitrary invalid intersection point (not box corner!) 
for (int j = INTERSECTION_OFFSET; j < m; ++j) { if (!mask[i * m + j]) { pad = j; break; } } if (num_valid[i] < 3) { // not enough vertices, take an invalid intersection point // (zero padding) for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { idx[i * MAX_NUM_VERT_IDX + j] = pad; } } else { // sort the valid vertices // note the number of valid vertices is known // note: check that num_valid[i] < MAX_NUM_VERT_IDX for (int j = 0; j < num_valid[i]; ++j) { // initialize with a "big" value float x_min = 1; float y_min = -EPSILON; int i_take = 0; int i2; float x2, y2; if (j != 0) { i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; x2 = vertices[i * m * 2 + i2 * 2 + 0]; y2 = vertices[i * m * 2 + i2 * 2 + 1]; } for (int k = 0; k < m; ++k) { float x = vertices[i * m * 2 + k * 2 + 0]; float y = vertices[i * m * 2 + k * 2 + 1]; if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { x_min = x; y_min = y; i_take = k; } } } idx[i * MAX_NUM_VERT_IDX + j] = i_take; } // duplicate the first idx idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; // pad zeros for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { idx[i * MAX_NUM_VERT_IDX + j] = pad; } // for corner case: the two boxes are exactly the same. 
// in this case, idx would have duplicate elements, which makes the // shoelace formula broken because of the definition, the duplicate // elements only appear in the first 8 positions (they are "corners in // box", not "intersection of edges") if (num_valid[i] == 8) { int counter = 0; for (int j = 0; j < 4; ++j) { int check = idx[i * MAX_NUM_VERT_IDX + j]; for (int k = 4; k < INTERSECTION_OFFSET; ++k) { if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; } } if (counter == 4) { idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { idx[i * MAX_NUM_VERT_IDX + j] = pad; } } } // TODO: still might need to cover some other corner cases :( } } } ================================================ FILE: mmcv/ops/csrc/common/musa/furthest_point_sample_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH #define FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" __device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, int idx1, int idx2) { const float v1 = dists[idx1], v2 = dists[idx2]; const int i1 = dists_i[idx1], i2 = dists_i[idx2]; dists[idx1] = max(v1, v2); dists_i[idx1] = v2 > v1 ? 
i2 : i1; } template __global__ void furthest_point_sampling_forward_musa_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { // dataset: (B, N, 3) // tmp: (B, N) // output: // idx: (B, M) if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * 3; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; float x1 = dataset[old * 3 + 0]; float y1 = dataset[old * 3 + 1]; float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { float x2, y2, z2; x2 = dataset[k * 3 + 0]; y2 = dataset[k * 3 + 1]; z2 = dataset[k * 3 + 2]; // float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); // if (mag <= 1e-3) // continue; float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); #pragma unroll for (int block_size_thres = 1024; block_size_thres >= 2; block_size_thres >>= 1) { const int tid_thres = block_size_thres / 2; if (block_size >= block_size_thres && tid < tid_thres) { __update(dists, dists_i, tid, tid + tid_thres); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } // Modified from // https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu template __global__ void furthest_point_sampling_with_dist_forward_musa_kernel( int b, int n, int m, const float *__restrict__ dataset, float *__restrict__ temp, int *__restrict__ idxs) { // dataset: (B, N, N) // tmp: (B, N) // output: // idx: (B, M) if (m <= 0) return; __shared__ float dists[block_size]; __shared__ int dists_i[block_size]; int batch_index = blockIdx.x; dataset += batch_index * n * n; temp += batch_index * n; idxs += batch_index * m; int tid = threadIdx.x; const int stride = block_size; int old = 0; if (threadIdx.x == 0) idxs[0] = old; __syncthreads(); for (int j = 1; j < m; j++) { int besti = 0; float best = -1; // float x1 = dataset[old * 3 + 0]; // float y1 = dataset[old * 3 + 1]; // float z1 = dataset[old * 3 + 2]; for (int k = tid; k < n; k += stride) { // float x2, y2, z2; // x2 = dataset[k * 3 + 0]; // y2 = dataset[k * 3 + 1]; // z2 = dataset[k * 3 + 2]; // float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * // (z2 - z1); float d = dataset[old * n + k]; float d2 = min(d, temp[k]); temp[k] = d2; besti = d2 > best ? k : besti; best = d2 > best ? 
d2 : best; } dists[tid] = best; dists_i[tid] = besti; __syncthreads(); #pragma unroll for (int block_size_thres = 1024; block_size_thres >= 2; block_size_thres >>= 1) { const int tid_thres = block_size_thres / 2; if (block_size >= block_size_thres && tid < tid_thres) { __update(dists, dists_i, tid, tid + tid_thres); } __syncthreads(); } old = dists_i[0]; if (tid == 0) idxs[j] = old; } } #endif // FURTHEST_POINT_SAMPLE_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/gather_points_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef GATHER_POINTS_MUSA_KERNEL_MUH #define GATHER_POINTS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" #define TOTAL_THREADS 1024 template __global__ void gather_points_forward_musa_kernel(int b, int c, int n, int m, const T *points, const int *__restrict__ idx, T *out) { // points: (B, C, N) // idx: (B, M) // output: // out: (B, C, M) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b || c_idx >= c) return; out += bs_idx * c * m + c_idx * m + pt_idx; idx += bs_idx * m + pt_idx; points += bs_idx * c * n + c_idx * n; out[0] = points[idx[0]]; } } template __global__ void gather_points_backward_musa_kernel(int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx, T *grad_points) { // grad_out: (B, C, M) // idx: (B, M) // output: // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b || c_idx >= c) return; grad_out += bs_idx * c * m + c_idx * m + pt_idx; idx += bs_idx * m + pt_idx; grad_points += bs_idx * c * n + c_idx * n; atomicAdd(grad_points + idx[0], grad_out[0]); } } #endif // GATHER_POINTS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/group_points_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved. // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu #ifndef GROUP_POINTS_MUSA_KERNEL_MUH #define GROUP_POINTS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void group_points_forward_musa_kernel(int b, int c, int n, int npoints, int nsample, const T *points, const int *__restrict__ idx, T *out) { // points: (B, C, N) // idx: (B, npoints, nsample) // output: // out: (B, C, npoints, nsample) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(index, npoints * nsample) { if (bs_idx >= b || c_idx >= c) return; int pt_idx = index / nsample; int sample_idx = index % nsample; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; int in_idx = bs_idx * c * n + c_idx * n + idx[0]; int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; out[out_idx] = points[in_idx]; } } template __global__ void group_points_backward_musa_kernel(int b, int c, int n, int npoints, int nsample, const T *grad_out, const int *__restrict__ idx, T *grad_points) { // grad_out: (B, C, npoints, nsample) // idx: (B, npoints, nsample) // output: // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(index, npoints * nsample) { int pt_idx = index / nsample; if (bs_idx >= b || c_idx >= c) return; int sample_idx = index % nsample; grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + pt_idx * nsample + sample_idx; idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); } } #endif // GROUP_POINTS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/iou3d_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef IOU3D_MUSA_KERNEL_MUH #define IOU3D_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" const int THREADS_PER_BLOCK_IOU3D = 16; const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8; __device__ const float EPS = 1e-8; struct Point { float x, y; __device__ Point() {} __device__ Point(double _x, double _y) { x = _x, y = _y; } __device__ void set(float _x, float _y) { x = _x; y = _y; } __device__ Point operator+(const Point &b) const { return Point(x + b.x, y + b.y); } __device__ Point operator-(const Point &b) const { return Point(x - b.x, y - b.y); } }; __device__ inline float cross(const Point &a, const Point &b) { return a.x * b.y - a.y * b.x; } __device__ inline float cross(const Point &p1, const Point &p2, const Point &p0) { return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y); } __device__ int check_rect_cross(const Point &p1, const Point &p2, const Point &q1, const Point &q2) { int ret = min(p1.x, p2.x) <= max(q1.x, q2.x) && min(q1.x, q2.x) <= max(p1.x, p2.x) && min(p1.y, p2.y) <= max(q1.y, q2.y) && min(q1.y, q2.y) <= max(p1.y, p2.y); return ret; } __device__ inline int check_in_box2d(const float *box, const Point &p) { // params: box (7) [x, y, z, dx, dy, dz, heading] const float MARGIN = 1e-2; float center_x = box[0], center_y = box[1]; // rotate the point in the opposite direction of box float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN); } __device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans_point) { // fast exclusion if (check_rect_cross(p0, p1, q0, q1) == 0) return 0; // check cross standing float s1 = cross(q0, p1, p0); float s2 = cross(p1, q1, p0); float s3 = cross(p0, q1, q0); float s4 = cross(q1, p1, 
q0); if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0; // calculate intersection of two lines float s5 = cross(q1, p1, p0); if (fabs(s5 - s1) > EPS) { ans_point.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1); ans_point.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1); } else { float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y; float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y; float D = a0 * b1 - a1 * b0; ans_point.x = (b0 * c1 - b1 * c0) / D; ans_point.y = (a1 * c0 - a0 * c1) / D; } return 1; } __device__ inline void rotate_around_center(const Point ¢er, const float angle_cos, const float angle_sin, Point &p) { float new_x = (p.x - center.x) * angle_cos - (p.y - center.y) * angle_sin + center.x; float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y; p.set(new_x, new_y); } __device__ inline int point_cmp(const Point &a, const Point &b, const Point ¢er) { return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x); } __device__ inline float box_overlap(const float *box_a, const float *box_b) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] float a_angle = box_a[6], b_angle = box_b[6]; float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; Point center_a(box_a[0], box_a[1]); Point center_b(box_b[0], box_b[1]); Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); box_a_corners[1].set(a_x2, a_y1); box_a_corners[2].set(a_x2, a_y2); box_a_corners[3].set(a_x1, a_y2); Point box_b_corners[5]; box_b_corners[0].set(b_x1, b_y1); box_b_corners[1].set(b_x2, b_y1); box_b_corners[2].set(b_x2, b_y2); box_b_corners[3].set(b_x1, b_y2); // get 
oriented corners float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle); float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle); for (int k = 0; k < 4; k++) { rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]); rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]); } box_a_corners[4] = box_a_corners[0]; box_b_corners[4] = box_b_corners[0]; // get intersection of lines Point cross_points[16]; Point poly_center; int cnt = 0, flag = 0; poly_center.set(0, 0); for (int i = 0; i < 4; i++) { for (int j = 0; j < 4; j++) { flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]); if (flag) { poly_center = poly_center + cross_points[cnt]; cnt++; } } } // check corners for (int k = 0; k < 4; k++) { if (check_in_box2d(box_a, box_b_corners[k])) { poly_center = poly_center + box_b_corners[k]; cross_points[cnt] = box_b_corners[k]; cnt++; } if (check_in_box2d(box_b, box_a_corners[k])) { poly_center = poly_center + box_a_corners[k]; cross_points[cnt] = box_a_corners[k]; cnt++; } } poly_center.x /= cnt; poly_center.y /= cnt; // sort the points of polygon Point temp; for (int j = 0; j < cnt - 1; j++) { for (int i = 0; i < cnt - j - 1; i++) { if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)) { temp = cross_points[i]; cross_points[i] = cross_points[i + 1]; cross_points[i + 1] = temp; } } } // get the overlap areas float area = 0; for (int k = 0; k < cnt - 1; k++) { area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]); } return fabs(area) / 2.0; } __device__ inline float iou_bev(const float *box_a, const float *box_b) { // params box_a: [x, y, z, dx, dy, dz, heading] // params box_b: [x, y, z, dx, dy, dz, heading] float sa = box_a[3] * box_a[4]; float sb = box_b[3] * box_b[4]; float s_overlap = box_overlap(box_a, box_b); return s_overlap / fmaxf(sa + sb - s_overlap, EPS); } __global__ void 
iou3d_boxes_overlap_bev_forward_musa_kernel( const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] MUSA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { if (a_idx >= num_a || b_idx >= num_b) { return; } const float *cur_box_a = boxes_a + a_idx * 7; const float *cur_box_b = boxes_b + b_idx * 7; float cur_overlap = box_overlap(cur_box_a, cur_box_b); ans_overlap[a_idx * num_b + b_idx] = cur_overlap; } } __global__ void iou3d_nms3d_forward_musa_kernel(const int boxes_num, const float nms_overlap_thresh, const float *boxes, unsigned long long *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { // if (row_start > col_start) return; const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; } 
__syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const float *cur_box = boxes + cur_box_idx * 7; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; mask[cur_box_idx * col_blocks + col_start] = t; } } } __device__ inline float iou_normal(float const *const a, float const *const b) { // params: a: [x, y, z, dx, dy, dz, heading] // params: b: [x, y, z, dx, dy, dz, heading] float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2); float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float interS = width * height; float Sa = a[3] * a[4]; float Sb = b[3] * b[4]; return interS / fmaxf(Sa + Sb - interS, EPS); } __global__ void iou3d_nms3d_normal_forward_musa_kernel( const int boxes_num, const float nms_overlap_thresh, const float *boxes, unsigned long long *mask) { // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) const int blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { // if (row_start > col_start) return; const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; block_boxes[threadIdx.x * 7 + 1] = 
boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; const float *cur_box = boxes + cur_box_idx * 7; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { t |= 1ULL << i; } } const int col_blocks = (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; mask[cur_box_idx * col_blocks + col_start] = t; } } } #endif // IOU3D_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/knn_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved // Modified from // https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg/lib/pointops/src/knnquery_heap #ifndef KNN_MUSA_KERNEL_MUH #define KNN_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" inline __device__ void swap_float(float *x, float *y) { float tmp = *x; *x = *y; *y = tmp; } inline __device__ void swap_int(int *x, int *y) { int tmp = *x; *x = *y; *y = tmp; } __device__ void reheap(float *dist, int *idx, int k) { int root = 0; int child = root * 2 + 1; while (child < k) { if (child + 1 < k && dist[child + 1] > dist[child]) child++; if (dist[root] > dist[child]) return; swap_float(&dist[root], &dist[child]); swap_int(&idx[root], &idx[child]); root = child; child = root * 2 + 1; } } __device__ void heap_sort(float *dist, int *idx, int k) { int i; for (i = k - 1; i > 0; i--) { swap_float(&dist[0], &dist[i]); swap_int(&idx[0], &idx[i]); reheap(dist, idx, i); } } // input: xyz (b, n, 3) new_xyz (b, m, 3) // output: idx (b, m, nsample) dist2 (b, m, nsample) template __global__ void knn_forward_musa_kernel(int b, int n, int m, int nsample, const T *xyz, const T *new_xyz, int *__restrict__ idx, T *dist2) { int bs_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, m) { if (bs_idx >= b) return; new_xyz += bs_idx * m * 3 + pt_idx * 3; xyz += bs_idx * n * 3; idx += bs_idx * m * nsample + pt_idx * nsample; dist2 += bs_idx * m * nsample + pt_idx * nsample; T new_x = new_xyz[0]; T new_y = new_xyz[1]; T new_z = new_xyz[2]; float best_dist[100]; int best_idx[100]; for (int i = 0; i < nsample; i++) { best_dist[i] = 1e10; best_idx[i] = 0; } for (int i = 0; i < n; i++) { T x = xyz[i * 3 + 0]; T y = xyz[i * 3 + 1]; T z = xyz[i * 3 + 2]; T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < best_dist[0]) { best_dist[0] = d2; best_idx[0] = i; reheap(best_dist, best_idx, nsample); } } heap_sort(best_dist, best_idx, nsample); for (int i = 0; i < nsample; i++) { idx[i] = best_idx[i]; dist2[i] = best_dist[i]; } } } 
#endif // KNN_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/masked_conv2d_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef MASKED_CONV2D_MUSA_KERNEL_MUH #define MASKED_CONV2D_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void MaskedIm2colForward(const int n, const scalar_t *data_im, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int64_t *mask_h_idx, const int64_t *mask_w_idx, const int mask_cnt, scalar_t *data_col) { // mask_cnt * channels MUSA_1D_KERNEL_LOOP(index, n) { const int m_index = index % mask_cnt; const int h_col = mask_h_idx[m_index]; const int w_col = mask_w_idx[m_index]; const int c_im = index / mask_cnt; const int c_col = c_im * kernel_h * kernel_w; const int h_offset = h_col - pad_h; const int w_offset = w_col - pad_w; scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index; for (int i = 0; i < kernel_h; ++i) { int h_im = h_offset + i; for (int j = 0; j < kernel_w; ++j) { int w_im = w_offset + j; if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { *data_col_ptr = (scalar_t)data_im[(c_im * height + h_im) * width + w_im]; } else { *data_col_ptr = 0.0; } data_col_ptr += mask_cnt; } } } } template __global__ void MaskedCol2imForward(const int n, const scalar_t *data_col, const int height, const int width, const int channels, const int64_t *mask_h_idx, const int64_t *mask_w_idx, const int mask_cnt, scalar_t *data_im) { MUSA_1D_KERNEL_LOOP(index, n) { const int m_index = index % mask_cnt; const int h_im = mask_h_idx[m_index]; const int w_im = mask_w_idx[m_index]; const int c_im = index / mask_cnt; // compute the start and end of the output data_im[(c_im * height + h_im) * width + w_im] = data_col[index]; } } #endif // MASKED_CONV2D_MUSA_KERNEL_MUH ================================================ FILE: 
mmcv/ops/csrc/common/musa/min_area_polygons_musa.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef MIN_AREA_POLYGONS_MUSA_KERNEL_MUH #define MIN_AREA_POLYGONS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" #define MAXN 20 __device__ const float PI = 3.1415926; struct Point { float x, y; __device__ Point() {} __device__ Point(float x, float y) : x(x), y(y) {} }; __device__ inline void swap1(Point *a, Point *b) { Point temp; temp.x = a->x; temp.y = a->y; a->x = b->x; a->y = b->y; b->x = temp.x; b->y = temp.y; } __device__ inline float cross(Point o, Point a, Point b) { return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); } __device__ inline float dis(Point a, Point b) { return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); } __device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { float convex_points[2][MAXN]; for (int j = 0; j < n_points; j++) { convex_points[0][j] = ps[j].x; } for (int j = 0; j < n_points; j++) { convex_points[1][j] = ps[j].y; } Point edges[MAXN]; float edges_angles[MAXN]; float unique_angles[MAXN]; int n_edges = n_points - 1; int n_unique = 0; int unique_flag = 0; for (int i = 0; i < n_edges; i++) { edges[i].x = ps[i + 1].x - ps[i].x; edges[i].y = ps[i + 1].y - ps[i].y; } for (int i = 0; i < n_edges; i++) { edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); if (edges_angles[i] >= 0) { edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); } else { edges_angles[i] = edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); } } unique_angles[0] = edges_angles[0]; n_unique += 1; for (int i = 1; i < n_edges; i++) { for (int j = 0; j < n_unique; j++) { if (edges_angles[i] == unique_angles[j]) { unique_flag += 1; } } if (unique_flag == 0) { unique_angles[n_unique] = edges_angles[i]; n_unique += 1; unique_flag = 0; } else { unique_flag = 0; } } float minarea = 1e12; for (int i = 0; i < n_unique; i++) { float R[2][2]; 
float rot_points[2][MAXN]; R[0][0] = cos(unique_angles[i]); R[0][1] = sin(unique_angles[i]); R[1][0] = -sin(unique_angles[i]); R[1][1] = cos(unique_angles[i]); // R x Points for (int m = 0; m < 2; m++) { for (int n = 0; n < n_points; n++) { float sum = 0.0; for (int k = 0; k < 2; k++) { sum = sum + R[m][k] * convex_points[k][n]; } rot_points[m][n] = sum; } } // xmin; float xmin, ymin, xmax, ymax; xmin = 1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { continue; } else { if (rot_points[0][j] < xmin) { xmin = rot_points[0][j]; } } } // ymin ymin = 1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { continue; } else { if (rot_points[1][j] < ymin) { ymin = rot_points[1][j]; } } } // xmax xmax = -1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { continue; } else { if (rot_points[0][j] > xmax) { xmax = rot_points[0][j]; } } } // ymax ymax = -1e12; for (int j = 0; j < n_points; j++) { if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { continue; } else { if (rot_points[1][j] > ymax) { ymax = rot_points[1][j]; } } } float area = (xmax - xmin) * (ymax - ymin); if (area < minarea) { minarea = area; minbox[0] = unique_angles[i]; minbox[1] = xmin; minbox[2] = ymin; minbox[3] = xmax; minbox[4] = ymax; } } } // convex_find __device__ inline void Jarvis(Point *in_poly, int &n_poly) { int n_input = n_poly; Point input_poly[20]; for (int i = 0; i < n_input; i++) { input_poly[i].x = in_poly[i].x; input_poly[i].y = in_poly[i].y; } Point p_max, p_k; int max_index, k_index; int Stack[20], top1, top2; // float sign; double sign; Point right_point[10], left_point[10]; for (int i = 0; i < n_poly; i++) { if (in_poly[i].y < in_poly[0].y || in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { Point *j = &(in_poly[0]); Point *k = &(in_poly[i]); swap1(j, k); } if (i == 0) { p_max = in_poly[0]; max_index = 0; } if (in_poly[i].y > 
p_max.y || in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { p_max = in_poly[i]; max_index = i; } } if (max_index == 0) { max_index = 1; p_max = in_poly[max_index]; } k_index = 0, Stack[0] = 0, top1 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > dis(in_poly[Stack[top1]], p_k)))) { p_k = in_poly[i]; k_index = i; } } top1++; Stack[top1] = k_index; } for (int i = 0; i <= top1; i++) { right_point[i] = in_poly[Stack[i]]; } k_index = 0, Stack[0] = 0, top2 = 0; while (k_index != max_index) { p_k = p_max; k_index = max_index; for (int i = 1; i < n_poly; i++) { sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > dis(in_poly[Stack[top2]], p_k))) { p_k = in_poly[i]; k_index = i; } } top2++; Stack[top2] = k_index; } for (int i = top2 - 1; i >= 0; i--) { left_point[i] = in_poly[Stack[i]]; } for (int i = 0; i < top1 + top2; i++) { if (i <= top1) { in_poly[i] = right_point[i]; } else { in_poly[i] = left_point[top2 - (i - top1)]; } } n_poly = top1 + top2; } template __device__ inline void Findminbox(T const *const p, T *minpoints) { Point ps1[MAXN]; Point convex[MAXN]; for (int i = 0; i < 9; i++) { convex[i].x = p[i * 2]; convex[i].y = p[i * 2 + 1]; } int n_convex = 9; Jarvis(convex, n_convex); int n1 = n_convex; for (int i = 0; i < n1; i++) { ps1[i].x = convex[i].x; ps1[i].y = convex[i].y; } ps1[n1].x = convex[0].x; ps1[n1].y = convex[0].y; float minbbox[5] = {0}; minBoundingRect(ps1, n1 + 1, minbbox); float angle = minbbox[0]; float xmin = minbbox[1]; float ymin = minbbox[2]; float xmax = minbbox[3]; float ymax = minbbox[4]; float R[2][2]; R[0][0] = cos(angle); R[0][1] = sin(angle); R[1][0] = -sin(angle); R[1][1] = cos(angle); minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; minpoints[1] = xmax * R[0][1] + ymin * 
R[1][1]; minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; } template __global__ void min_area_polygons_musa_kernel(const int ex_n_boxes, const T *ex_boxes, T *minbox) { MUSA_1D_KERNEL_LOOP(index, ex_n_boxes) { const T *cur_box = ex_boxes + index * 18; T *cur_min_box = minbox + index * 8; Findminbox(cur_box, cur_min_box); } } #endif // MIN_AREA_POLYGONS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/modulated_deform_conv_musa_kernel.muh ================================================ /*! ******************* BEGIN Caffe Copyright Notice and Disclaimer ***************** * * COPYRIGHT * * All contributions by the University of California: * Copyright (c) 2014-2017 The Regents of the University of California (Regents) * All rights reserved. * * All other contributions: * Copyright (c) 2014-2017, the respective contributors * All rights reserved. * * Caffe uses a shared copyright model: each contributor holds copyright over * their contributions to Caffe. The project versioning records all such * contribution and copyright details. If a contributor wants to further mark * their specific copyright on a particular contribution, they should indicate * their copyright solely in the commit message of the change when it is * committed. * * LICENSE * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, *this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * CONTRIBUTION AGREEMENT * * By contributing to the BVLC/caffe repository through pull-request, comment, * or otherwise, the contributor releases their content to the * license and copyright terms herein. * ***************** END Caffe Copyright Notice and Disclaimer ********************* * * Copyright (c) 2018 Microsoft * Licensed under The MIT License [see LICENSE for details] * \file modulated_deformable_im2col.muh * \brief Function definitions of converting an image to * column matrix based on kernel, padding, dilation, and offset. * These functions are mainly used in deformable convolution operators. 
* \ref: https://arxiv.org/abs/1703.06211 * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng */ // modified from // https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu #ifndef MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH #define MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" template __device__ T dmcn_im2col_bilinear(const T *input, const int data_width, const int height, const int width, T h, T w) { int h_low = floorf(h); int w_low = floorf(w); int h_high = h_low + 1; int w_high = w_low + 1; T lh = h - h_low; T lw = w - w_low; T hh = 1 - lh, hw = 1 - lw; T v1 = 0; if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low]; T v2 = 0; if (h_low >= 0 && w_high <= width - 1) v2 = input[h_low * data_width + w_high]; T v3 = 0; if (h_high <= height - 1 && w_low >= 0) v3 = input[h_high * data_width + w_low]; T v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) v4 = input[h_high * data_width + w_high]; T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ T dmcn_get_gradient_weight(T argmax_h, T argmax_w, const int h, const int w, const int height, const int width) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floorf(argmax_h); int argmax_w_low = floorf(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; T weight = 0; if (h == argmax_h_low && w == argmax_w_low) weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); if (h == argmax_h_low && w == argmax_w_high) weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); if (h == argmax_h_high && w == argmax_w_low) weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); if (h == argmax_h_high && w == argmax_w_high) weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); return weight; } template __device__ T 
dmcn_get_coordinate_weight(T argmax_h, T argmax_w, const int height, const int width, const T *im_data, const int data_width, const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { // empty return 0; } int argmax_h_low = floorf(argmax_h); int argmax_w_low = floorf(argmax_w); int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; T weight = 0; if (bp_dir == 0) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } else if (bp_dir == 1) { if (argmax_h_low >= 0 && argmax_w_low >= 0) weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; if (argmax_h_low >= 0 && argmax_w_high <= width - 1) weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; if (argmax_h_high <= height - 1 && argmax_w_low >= 0) weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; } return weight; } template __global__ void modulated_deformable_im2col_gpu_kernel( const int n, const T *data_im, const T *data_offset, const T *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const 
int channel_per_deformable_group, const int batch_size, const int num_channels, const int deformable_group, const int height_col, const int width_col, T *data_col) { MUSA_1D_KERNEL_LOOP(index, n) { // index index of output matrix const int w_col = index % width_col; const int h_col = (index / width_col) % height_col; const int b_col = (index / width_col / height_col) % batch_size; const int c_im = (index / width_col / height_col) / batch_size; const int c_col = c_im * kernel_h * kernel_w; // compute deformable group index const int deformable_group_index = c_im / channel_per_deformable_group; const int h_in = h_col * stride_h - pad_h; const int w_in = w_col * stride_w - pad_w; T *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; const T *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; const T *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const T *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; for (int i = 0; i < kernel_h; ++i) { for (int j = 0; j < kernel_w; ++j) { const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; const T mask = data_mask_ptr[data_mask_hw_ptr]; T val = static_cast(0); const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); *data_col_ptr = val * mask; data_col_ptr += batch_size * 
height_col * width_col; } } } } template __global__ void modulated_deformable_col2im_gpu_kernel( const int n, const T *data_col, const T *data_offset, const T *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int deformable_group, const int height_col, const int width_col, T *grad_im) { MUSA_1D_KERNEL_LOOP(index, n) { const int j = (index / width_col / height_col / batch_size) % kernel_w; const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; // compute the start and end of the output const int deformable_group_index = c / channel_per_deformable_group; int w_out = index % width_col; int h_out = (index / width_col) % height_col; int b = (index / width_col / height_col) % batch_size; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const T *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; const T *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; const T mask = data_mask_ptr[data_mask_hw_ptr]; const T cur_inv_h_data = h_in + i * dilation_h + offset_h; const T cur_inv_w_data = w_in + j * dilation_w + offset_w; const T cur_top_grad = data_col[index] 
* mask; const int cur_h = (int)cur_inv_h_data; const int cur_w = (int)cur_inv_w_data; for (int dy = -2; dy <= 2; dy++) { for (int dx = -2; dx <= 2; dx++) { if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && abs(cur_inv_w_data - (cur_w + dx)) < 1) { int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; T weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); } } } } } template __global__ void modulated_deformable_col2im_coord_gpu_kernel( const int n, const T *data_col, const T *data_im, const T *data_offset, const T *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int batch_size, const int offset_channels, const int deformable_group, const int height_col, const int width_col, T *grad_offset, T *grad_mask) { MUSA_1D_KERNEL_LOOP(index, n) { T val = 0, mval = 0; int w = index % width_col; int h = (index / width_col) % height_col; int c = (index / width_col / height_col) % offset_channels; int b = (index / width_col / height_col) / offset_channels; // compute the start and end of the output const int deformable_group_index = c / (2 * kernel_h * kernel_w); const int col_step = kernel_h * kernel_w; int cnt = 0; const T *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; const T *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; const T *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; 
const T *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) { const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; const int bp_dir = offset_c % 2; int j = (col_pos / width_col / height_col / batch_size) % kernel_w; int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; int w_out = col_pos % width_col; int h_out = (col_pos / width_col) % height_col; int w_in = w_out * stride_w - pad_w; int h_in = h_out * stride_h - pad_h; const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); const T offset_h = data_offset_ptr[data_offset_h_ptr]; const T offset_w = data_offset_ptr[data_offset_w_ptr]; const T mask = data_mask_ptr[data_mask_hw_ptr]; T inv_h = h_in + i * dilation_h + offset_h; T inv_w = w_in + j * dilation_w + offset_w; if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) inv_h = inv_w = -2; else mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); const T weight = dmcn_get_coordinate_weight( inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, width, bp_dir); val += weight * data_col_ptr[col_pos] * mask; cnt += 1; } // KERNEL_ASSIGN(grad_offset[index], offset_req, val); grad_offset[index] = val; if (offset_c % 2 == 0) // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * // height_col + h) * width_col + w], mask_req, mval); grad_mask[(((b * deformable_group + 
deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; } } #endif // MODULATED_DEFORM_CONV_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/ms_deform_attn_musa_kernel.muh ================================================ /*! ************************************************************************************************** * Deformable DETR * Copyright (c) 2020 SenseTime. All Rights Reserved. * Licensed under the Apache License, Version 2.0 [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #ifndef DEFORM_ATTN_MUSA_KERNEL #define DEFORM_ATTN_MUSA_KERNEL #include "common_musa_helper.hpp" #include "pytorch_musa_helper.hpp" template __device__ scalar_t ms_deform_attn_im2col_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c) { const int h_low = floorf(h); const int w_low = floorf(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int 
ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void ms_deform_attn_col2im_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t *&grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { const int h_low = floorf(h); const int w_low = floorf(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; 
v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_attn_weight = top_grad * val; *grad_sampling_loc = width * grad_w_weight * top_grad_value; *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; } template __device__ void ms_deform_attn_col2im_bilinear_gm( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &channels, const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &top_grad, const scalar_t &attn_weight, scalar_t *&grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { const int h_low = floorf(h); const int w_low = floorf(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const scalar_t lh = h - h_low; const scalar_t lw = w - w_low; const scalar_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * channels + c; const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const scalar_t top_grad_value = top_grad * attn_weight; scalar_t grad_h_weight = 0, grad_w_weight = 0; scalar_t v1 = 0; if (h_low 
>= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_value + ptr1, w1 * top_grad_value); } scalar_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_value + ptr2, w2 * top_grad_value); } scalar_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_value + ptr3, w3 * top_grad_value); } scalar_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_value + ptr4, w4 * top_grad_value); } const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); } template __global__ void ms_deformable_im2col_gpu_kernel( const int n, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *data_col) { MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; scalar_t *data_col_ptr = data_col + index; int data_weight_ptr = sampling_index * num_levels * 
num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; scalar_t col = 0; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; const int qid_stride = num_heads * channels; MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const 
int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], 
_grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } *grad_sampling_loc_out = _grad_w; *(grad_sampling_loc_out + 1) = _grad_h; *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; __shared__ scalar_t cache_grad_attn_weight[blockSize]; unsigned int tid = threadIdx.x; MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; 
++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_sampling_loc_out = cache_grad_sampling_loc[0]; *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void 
ms_deformable_col2im_gpu_kernel_shm_reduce_v1( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { extern __shared__ int _s[]; scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t 
loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); if (tid == 0) { scalar_t _grad_w = cache_grad_sampling_loc[0], _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } *grad_sampling_loc_out = _grad_w; *(grad_sampling_loc_out + 1) = _grad_h; *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { extern __shared__ int _s[]; scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); scalar_t *cache_grad_attn_weight = 
cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, 
weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_sampling_loc_out = cache_grad_sampling_loc[0]; *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { extern __shared__ int _s[]; scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; unsigned int tid = threadIdx.x; MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int 
m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0; *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_attn_weight + threadIdx.x) = 0; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, cache_grad_sampling_loc + (threadIdx.x << 1), cache_grad_attn_weight + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid 
< s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } template __global__ void ms_deformable_col2im_gpu_kernel_gm( const int n, const scalar_t *grad_col, const scalar_t *data_value, const int64_t *data_spatial_shapes, const int64_t *data_level_start_index, const scalar_t *data_sampling_loc, const scalar_t *data_attn_weight, const int batch_size, const int spatial_size, const int num_heads, const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { MUSA_1D_KERNEL_LOOP(index, n) { int _temp = index; const int c_col = _temp % channels; _temp /= channels; const int sampling_index = _temp; const int m_col = _temp % num_heads; _temp /= num_heads; _temp /= num_query; const int b_col = _temp; const scalar_t top_grad = grad_col[index]; int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; scalar_t *grad_sampling_loc_out = grad_sampling_loc + (grad_sampling_ptr << 1); scalar_t *grad_attn_weight_out = grad_attn_weight + 
grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { const int level_start_id = data_level_start_index[l_col]; const int spatial_h_ptr = l_col << 1; const int spatial_h = data_spatial_shapes[spatial_h_ptr]; const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; const scalar_t *data_value_ptr = data_value + value_ptr_offset; scalar_t *grad_value_ptr = grad_value + value_ptr_offset; for (int p_col = 0; p_col < num_point; ++p_col) { const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; const scalar_t weight = data_attn_weight[data_weight_ptr]; const scalar_t h_im = loc_h * spatial_h - 0.5; const scalar_t w_im = loc_w * spatial_w - 0.5; if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) { ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, grad_sampling_loc_out, grad_attn_weight_out); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_attn_weight_out += grad_weight_stride; grad_sampling_loc_out += grad_loc_stride; } } } } #endif // DEFORM_ATTN_MUSA_KERNEL ================================================ FILE: mmcv/ops/csrc/common/musa/nms_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef NMS_MUSA_KERNEL_MUH #define NMS_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" int const threadsPerBlock = sizeof(unsigned long long int) * 8; __device__ inline bool devIoU(float const *const a, float const *const b, const int offset, const float threshold) { float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); float width = fmaxf(right - left + offset, 0.f), height = fmaxf(bottom - top + offset, 0.f); float interS = width * height; float Sa = (a[2] - a[0] + offset) * (a[3] - a[1] + offset); float Sb = (b[2] - b[0] + offset) * (b[3] - b[1] + offset); return interS > threshold * (Sa + Sb - interS); } __global__ static void nms_musa(const int n_boxes, const float iou_threshold, const int offset, const float *dev_boxes, unsigned long long *dev_mask) { int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; MUSA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { const int tid = threadIdx.x; if (row_start > col_start) return; const int row_size = fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); __shared__ float block_boxes[threadsPerBlock * 4]; if (tid < col_size) { block_boxes[tid * 4 + 0] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; block_boxes[tid * 4 + 1] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; block_boxes[tid * 4 + 2] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; block_boxes[tid * 4 + 3] = dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; } __syncthreads(); if (tid < row_size) { const int cur_box_idx = threadsPerBlock * row_start + tid; const float *cur_box = dev_boxes + cur_box_idx * 4; int i = 0; unsigned long long int t = 0; int start = 0; if (row_start == col_start) { start = tid + 1; } for (i = start; i < col_size; i++) { if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { t |= 
1ULL << i; } } dev_mask[cur_box_idx * gridDim.y + col_start] = t; } } } __global__ static void gather_keep_from_mask(bool *keep, const unsigned long long *dev_mask, const int n_boxes) { const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; const int tid = threadIdx.x; // mark the bboxes which have been removed. extern __shared__ unsigned long long removed[]; // initialize removed. for (int i = tid; i < col_blocks; i += blockDim.x) { removed[i] = 0; } __syncthreads(); for (int nblock = 0; nblock < col_blocks; ++nblock) { auto removed_val = removed[nblock]; __syncthreads(); const int i_offset = nblock * threadsPerBlock; #pragma unroll for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { const int i = i_offset + inblock; if (i >= n_boxes) break; // select a candidate, check if it should kept. if (!(removed_val & (1ULL << inblock))) { if (tid == 0) { // mark the output. keep[i] = true; } auto p = dev_mask + i * col_blocks; // remove all bboxes which overlap the candidate. for (int j = tid; j < col_blocks; j += blockDim.x) { if (j >= nblock) removed[j] |= p[j]; } __syncthreads(); removed_val = removed[nblock]; } } } } #endif // NMS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/nms_quadri_musa.muh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved #ifndef NMS_QUADRI_MUSA_MUH #define NMS_QUADRI_MUSA_MUH #include "pytorch_musa_helper.hpp" #include "box_iou_rotated_utils.hpp" __host__ __device__ inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } namespace { int const threadsPerBlock = sizeof(unsigned long long) * 8; } template __global__ void nms_quadri_musa_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes, unsigned long long* dev_mask, const int multi_label) { if (multi_label == 1) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 8 values // (x1, y1, ..., x4, y4) here. __shared__ T block_boxes[threadsPerBlock * 8]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 8 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 0]; block_boxes[threadIdx.x * 8 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 1]; block_boxes[threadIdx.x * 8 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 2]; block_boxes[threadIdx.x * 8 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 3]; block_boxes[threadIdx.x * 8 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 4]; block_boxes[threadIdx.x * 8 + 5] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 5]; block_boxes[threadIdx.x * 8 + 6] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 6]; block_boxes[threadIdx.x * 8 + 7] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 9 + 7]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 9; int i = 
0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_quadri function from // box_iou_rotated_utils.h if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } else { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 8 values // (x1, y1, , ..., x4, y4) here. __shared__ T block_boxes[threadsPerBlock * 8]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 8 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 0]; block_boxes[threadIdx.x * 8 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 1]; block_boxes[threadIdx.x * 8 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 2]; block_boxes[threadIdx.x * 8 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 3]; block_boxes[threadIdx.x * 8 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 4]; block_boxes[threadIdx.x * 8 + 5] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 5]; block_boxes[threadIdx.x * 8 + 6] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 6]; block_boxes[threadIdx.x * 8 + 7] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 8 + 7]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 8; int i = 0; unsigned long 
long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_quadri function from // box_iou_rotated_utils.h if (single_box_iou_quadri(cur_box, block_boxes + i * 8, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } } #endif ================================================ FILE: mmcv/ops/csrc/common/musa/nms_rotated_musa.muh ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved // modified from // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu #ifndef NMS_ROTATED_MUSA_MUH #define NMS_ROTATED_MUSA_MUH #include "pytorch_musa_helper.hpp" #include "box_iou_rotated_utils.hpp" __host__ __device__ inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); } namespace { int const threadsPerBlock = sizeof(unsigned long long) * 8; } template __global__ void nms_rotated_musa_kernel(const int n_boxes, const float iou_threshold, const T* dev_boxes, unsigned long long* dev_mask, const int multi_label) { // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel if (multi_label == 1) { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 5 values // (x_center, y_center, width, height, angle_degrees) here. 
__shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 6; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } else { const int row_start = blockIdx.y; const int col_start = blockIdx.x; // if (row_start > col_start) return; const int row_size = min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); const int col_size = min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); // Compared to nms_cuda_kernel, where each box is represented with 4 values // (x1, y1, x2, y2), each rotated box is represented with 5 values // (x_center, y_center, width, height, angle_degrees) here. 
__shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; } __syncthreads(); if (threadIdx.x < row_size) { const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; const T* cur_box = dev_boxes + cur_box_idx * 5; int i = 0; unsigned long long t = 0; int start = 0; if (row_start == col_start) { start = threadIdx.x + 1; } for (i = start; i < col_size; i++) { // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > iou_threshold) { t |= 1ULL << i; } } const int col_blocks = divideUP(n_boxes, threadsPerBlock); dev_mask[cur_box_idx * col_blocks + col_start] = t; } } } #endif ================================================ FILE: mmcv/ops/csrc/common/musa/points_in_boxes_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef POINT_IN_BOXES_MUSA_KERNEL_MUH #define POINT_IN_BOXES_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, T &local_x, T &local_y) { T cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } template __device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, T &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, // cz in the bottom center T x = pt[0], y = pt[1], z = pt[2]; T cx = box3d[0], cy = box3d[1], cz = box3d[2]; T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } template __global__ void points_in_boxes_part_forward_musa_kernel( int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // coordinate, z is the bottom center, each box DO NOT overlaps params pts: // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: // (B, npoints), default -1 int bs_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (bs_idx >= batch_size) return; boxes += bs_idx * boxes_num * 7; pts += bs_idx * pts_num * 3 + pt_idx * 3; box_idx_of_points += bs_idx * pts_num + pt_idx; T local_x = 0, local_y = 0; int cur_in_flag = 0; for (int k = 0; k < boxes_num; k++) { cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); if (cur_in_flag) { box_idx_of_points[0] = k; break; } } } } template __global__ void 
points_in_boxes_all_forward_musa_kernel( int batch_size, int boxes_num, int pts_num, const T *boxes, const T *pts, int *box_idx_of_points) { // params boxes: (B, N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // coordinate, z is the bottom center, each box DO NOT overlaps params pts: // (B, npoints, 3) [x, y, z] in LiDAR coordinate params boxes_idx_of_points: // (B, npoints), default -1 int bs_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (bs_idx >= batch_size) return; boxes += bs_idx * boxes_num * 7; pts += bs_idx * pts_num * 3 + pt_idx * 3; box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; T local_x = 0, local_y = 0; for (int k = 0; k < boxes_num; k++) { const int cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); if (cur_in_flag) { box_idx_of_points[k] = 1; } } } } #endif // POINT_IN_BOXES_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/points_in_polygons_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef POINTS_IN_POLYGONS_MUSA_KERNEL_MUH #define POINTS_IN_POLYGONS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" struct point { float x, y; }; template __global__ void points_in_polygons_forward_musa_kernel( const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2, const int rows, const int cols, scalar_t *inside_flag) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int row = index / cols; int col = index % cols; const scalar_t *offset_vertex1 = vertex1 + row * 2; const scalar_t *offset_vertex2 = vertex2 + col * 8; point point_[1]; point polygon[4]; point_[0].x = offset_vertex1[0]; point_[0].y = offset_vertex1[1]; polygon[0].x = offset_vertex2[0]; polygon[0].y = offset_vertex2[1]; polygon[1].x = offset_vertex2[2]; polygon[1].y = offset_vertex2[3]; polygon[2].x = offset_vertex2[4]; polygon[2].y = offset_vertex2[5]; polygon[3].x = offset_vertex2[6]; polygon[3].y = offset_vertex2[7]; int nCross = 0; int i, j; float sx, sy, tx, ty, px, py, x; for (i = 0, j = 3; i < 4; j = i, i++) { sx = polygon[i].x; sy = polygon[i].y; tx = polygon[j].x; ty = polygon[j].y; px = point_[0].x; py = point_[0].y; if (py < min(sy, ty)) continue; if (py > max(sy, ty)) continue; if ((sx == px && sy == py) || (tx == px && ty == py)) { break; } else { if ((sy < py && ty >= py) || (sy >= py && ty < py)) { x = sx + (py - sy) * (tx - sx) / (ty - sy); if (x == px) { break; } if (x > px) { nCross++; } } } } if (nCross % 2 == 1) { inside_flag[index] = 1.0; } else { inside_flag[index] = 0.0; } return; } } #endif // POINTS_IN_POLYGONS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/prroi_pool_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu // Distributed under terms of the MIT license. 
#ifndef PRROI_POOL_MUSA_KERNEL_MUH #define PRROI_POOL_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __device__ static __forceinline__ T PrRoIPoolingGetData(const T *data, const int h, const int w, const int height, const int width) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); T retVal = overflow ? 0.0f : data[h * width + w]; return retVal; } template __device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) { return (1.0f - abs(dh)) * (1.0f - abs(dw)); } template __device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t, T c1, T c2) { return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1; } template __device__ static T PrRoIPoolingInterpolation(const T *data, const T h, const T w, const int height, const int width) { T retVal = 0.0f; int h1 = floorf(h); int w1 = floorf(w); retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); h1 = floorf(h) + 1; w1 = floorf(w); retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); h1 = floorf(h); w1 = floorf(w) + 1; retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); h1 = floorf(h) + 1; w1 = floorf(w) + 1; retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); return retVal; } template __device__ static T PrRoIPoolingMatCalculation(const T *this_data, const int s_h, const int s_w, const int e_h, const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, const int w0) { T alpha, beta, lim_alpha, lim_beta, tmp; T sum_out = 0; alpha = x0 - T(s_w); beta = y0 - T(s_h); lim_alpha = x1 - T(s_w); lim_beta = y1 - T(s_h); tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; alpha = T(e_w) - 
x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; alpha = x0 - T(s_w); beta = T(e_h) - y1; lim_alpha = x1 - T(s_w); lim_beta = T(e_h) - y0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; alpha = T(e_w) - x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; return sum_out; } template __device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff, const int h, const int w, const int height, const int width, const T coeff) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff); } template __device__ static void PrRoIPoolingMatDistributeDiff( T *diff, const T top_diff, const int s_h, const int s_w, const int e_h, const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, const int w0) { T alpha, beta, lim_alpha, lim_beta, tmp; alpha = x0 - T(s_w); beta = y0 - T(s_h); lim_alpha = x1 - T(s_w); lim_beta = y1 - T(s_h); tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp); alpha = T(e_w) - x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); alpha = x0 - T(s_w); 
beta = T(e_h) - y1; lim_alpha = x1 - T(s_w); lim_beta = T(e_h) - y0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); alpha = T(e_w) - x1; lim_alpha = T(e_w) - x0; tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + 0.5f * alpha * alpha) * (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); } template __global__ void prroi_pool_forward_musa_kernel( const int nthreads, const T *input, const T *rois, T *output, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T *offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; T roi_x1 = offset_rois[1] * spatial_scale; T roi_y1 = offset_rois[2] * spatial_scale; T roi_x2 = offset_rois[3] * spatial_scale; T roi_y2 = offset_rois[4] * spatial_scale; T roi_width = max(roi_x2 - roi_x1, ((T)0.0)); T roi_height = max(roi_y2 - roi_y1, ((T)0.0)); T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); const T *this_data = input + (roi_batch_ind * channels + c) * height * width; T *this_out = output + index; T bin_x1 = roi_x1 + bin_size_w * pw; T bin_y1 = roi_y1 + bin_size_h * ph; T bin_x2 = bin_x1 + bin_size_w; T bin_y2 = bin_y1 + bin_size_h; T bin_size = max(T(0.0), bin_size_w * bin_size_h); if (bin_size == 0) { *this_out = 0; continue; } T sum_out = 0; int start_x, start_y, end_x, end_y; start_x = floorf(bin_x1); end_x = ceilf(bin_x2); start_y = 
floorf(bin_y1); end_y = ceilf(bin_y2); for (int bin_x = start_x; bin_x < end_x; ++bin_x) for (int bin_y = start_y; bin_y < end_y; ++bin_y) sum_out += PrRoIPoolingMatCalculation( this_data, bin_y, bin_x, bin_y + 1, bin_x + 1, max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, width); *this_out = sum_out / bin_size; } } template __global__ void prroi_pool_backward_musa_kernel( const int nthreads, const T *grad_output, const T *rois, T *grad_input, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; auto rois_cur = rois + n * 5; int roi_batch_ind = rois_cur[0]; T roi_x1 = rois_cur[1] * spatial_scale; T roi_y1 = rois_cur[2] * spatial_scale; T roi_x2 = rois_cur[3] * spatial_scale; T roi_y2 = rois_cur[4] * spatial_scale; T roi_width = max(roi_x2 - roi_x1, (T)0); T roi_height = max(roi_y2 - roi_y1, (T)0); T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); const T *this_out_grad = grad_output + index; T *this_data_grad = grad_input + (roi_batch_ind * channels + c) * height * width; T bin_x1 = roi_x1 + bin_size_w * pw; T bin_y1 = roi_y1 + bin_size_h * ph; T bin_x2 = bin_x1 + bin_size_w; T bin_y2 = bin_y1 + bin_size_h; T bin_size = max(T(0.0), bin_size_w * bin_size_h); T sum_out = bin_size == T(0) ? 
T(0) : *this_out_grad / bin_size; int start_x, start_y, end_x, end_y; start_x = floorf(bin_x1); end_x = ceilf(bin_x2); start_y = floorf(bin_y1); end_y = ceilf(bin_y2); for (int bin_x = start_x; bin_x < end_x; ++bin_x) for (int bin_y = start_y; bin_y < end_y; ++bin_y) PrRoIPoolingMatDistributeDiff( this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1, max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, width); } } template __global__ void prroi_pool_coor_backward_musa_kernel( const int nthreads, const T *output, const T *grad_output, const T *input, const T *rois, T *grad_rois, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; auto rois_cur = rois + n * 5; int roi_batch_ind = rois_cur[0]; T roi_x1 = rois_cur[1] * spatial_scale; T roi_y1 = rois_cur[2] * spatial_scale; T roi_x2 = rois_cur[3] * spatial_scale; T roi_y2 = rois_cur[4] * spatial_scale; T roi_width = max(roi_x2 - roi_x1, (T)0); T roi_height = max(roi_y2 - roi_y1, (T)0); T bin_size_h = roi_height / static_cast(pooled_height); T bin_size_w = roi_width / static_cast(pooled_width); const T output_grad_val = grad_output[index]; const T *this_input_data = input + (roi_batch_ind * channels + c) * height * width; const T output_val = output[index]; T *this_rois_grad = grad_rois + n * 5; T bin_x1 = roi_x1 + bin_size_w * pw; T bin_y1 = roi_y1 + bin_size_h * ph; T bin_x2 = bin_x1 + bin_size_w; T bin_y2 = bin_y1 + bin_size_h; T bin_size = max(T(0.0), bin_size_w * bin_size_h); T sum_out = bin_size == T(0) ? 
T(0) : output_grad_val / bin_size; // WARNING: to be discussed if (sum_out == 0) continue; int start_x, start_y, end_x, end_y; start_x = floorf(bin_x1); end_x = ceilf(bin_x2); start_y = floorf(bin_y1); end_y = ceilf(bin_y2); T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0; for (int bin_y = start_y; bin_y < end_y; ++bin_y) { grad_x1_y += PrRoIPoolingSingleCoorIntegral( max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1, height, width), PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1, height, width)); grad_x2_y += PrRoIPoolingSingleCoorIntegral( max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2, height, width), PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2, height, width)); } for (int bin_x = start_x; bin_x < end_x; ++bin_x) { grad_x_y1 += PrRoIPoolingSingleCoorIntegral( max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x), height, width), PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1), height, width)); grad_x_y2 += PrRoIPoolingSingleCoorIntegral( max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x), height, width), PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1), height, width)); } T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val; T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val; T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val; T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val; partial_x1 = partial_x1 / bin_size * spatial_scale; partial_x2 = partial_x2 / bin_size * spatial_scale; partial_y1 = partial_y1 / bin_size * spatial_scale; partial_y2 = partial_y2 / bin_size * spatial_scale; // (index, x1, y1, x2, y2) this_rois_grad[0] = 
0; atomicAdd(this_rois_grad + 1, (partial_x1 * (1.0f - T(pw) / pooled_width) + partial_x2 * (1.0f - T(pw + 1) / pooled_width)) * output_grad_val); atomicAdd(this_rois_grad + 2, (partial_y1 * (1.0f - T(ph) / pooled_height) + partial_y2 * (1.0f - T(ph + 1) / pooled_height)) * output_grad_val); atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width + partial_x1 * T(pw) / pooled_width) * output_grad_val); atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height + partial_y1 * T(ph) / pooled_height) * output_grad_val); } } #endif // ROI_POOL_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/psamask_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef PSAMASK_MUSA_KERNEL_MUH #define PSAMASK_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" // MUSA: grid stride looping #ifndef MUSA_KERNEL_LOOP #define MUSA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) #endif template __global__ void psamask_collect_forward_musa( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* mask_data, T* buffer_data) { MUSA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { buffer_data[(n * h_feature * w_feature + 
(hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)) * h_feature * w_feature + h * w_feature + w] = mask_data [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w]; } } } } template __global__ void psamask_distribute_forward_musa( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* mask_data, T* buffer_data) { MUSA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { buffer_data[(n * h_feature * w_feature + h * w_feature + w) * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)] = mask_data [((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w]; } } } } template __global__ void psamask_collect_backward_musa( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* buffer_diff, T* mask_diff) { MUSA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = 
min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w] = buffer_diff[(n * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)) * h_feature * w_feature + h * w_feature + w]; } } } } template __global__ void psamask_distribute_backward_musa( const int nthreads, const int h_feature, const int w_feature, const int h_mask, const int w_mask, const int half_h_mask, const int half_w_mask, const T* buffer_diff, T* mask_diff) { MUSA_KERNEL_LOOP(index, nthreads) { const int w = index % w_feature; const int h = (index / w_feature) % h_feature; const int n = index / w_feature / h_feature; // effective mask region : [hstart, hend) x [wstart, wend) with mask-indexed const int hstart = max(0, half_h_mask - h); const int hend = min(h_mask, h_feature + half_h_mask - h); const int wstart = max(0, half_w_mask - w); const int wend = min(w_mask, w_feature + half_w_mask - w); // (hidx, widx ) with mask-indexed // (hidx + h - half_h_mask, widx + w - half_w_mask) with feature-indexed for (int hidx = hstart; hidx < hend; hidx++) { for (int widx = wstart; widx < wend; widx++) { mask_diff[((n * h_mask * w_mask + hidx * w_mask + widx) * h_feature + h) * w_feature + w] = buffer_diff[(n * h_feature * w_feature + h * w_feature + w) * h_feature * w_feature + (hidx + h - half_h_mask) * w_feature + (widx + w - half_w_mask)]; } } } } #endif // PSAMASK_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/riroi_align_rotated_musa_kernel.muh ================================================ // Modified from // https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu #ifndef RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH #define 
RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" /*** Forward ***/ template __global__ void riroi_align_rotated_forward_musa_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int num_samples, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int num_orientations, scalar_t *top_data) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int o = (index / pooled_width / pooled_height) % num_orientations; int c = (index / pooled_width / pooled_height / num_orientations) % channels; int n = index / pooled_width / pooled_height / num_orientations / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // find aligned index scalar_t ind_float = theta * num_orientations / (2 * M_PI); int ind = floorf(ind_float); scalar_t l_var = ind_float - (scalar_t)ind; scalar_t r_var = 1.0 - l_var; // correct start channel ind = (ind + num_orientations) % num_orientations; // rotated channel int ind_rot = (o - ind + num_orientations) % num_orientations; int 
ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot) * height * width; const scalar_t *offset_bottom_data_plus = bottom_data + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot_plus) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (num_samples > 0) ? num_samples : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. } scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosscalar_theta = cos(theta); scalar_t sinscalar_theta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 scalar_t output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta (counterclockwise) around the center and translate scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; scalar_t val = bilinear_interpolate( offset_bottom_data, height, width, y, x, index); scalar_t val_plus = bilinear_interpolate( offset_bottom_data_plus, height, width, y, x, index); output_val += r_var * val + l_var * val_plus; } } output_val /= count; top_data[index] = output_val; } } /*** Backward ***/ template __global__ void riroi_align_rotated_backward_musa_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int num_samples, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, const int num_orientations, scalar_t *bottom_diff) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int o = (index / pooled_width / pooled_height) % num_orientations; int c = (index / pooled_width / pooled_height / num_orientations) % channels; int n = index / pooled_width / pooled_height / num_orientations / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not round scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t 
roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // find aligned index scalar_t ind_float = theta * num_orientations / (2 * M_PI); int ind = floorf(ind_float); scalar_t l_var = ind_float - (scalar_t)ind; scalar_t r_var = 1.0 - l_var; // correct start channel ind = (ind + num_orientations) % num_orientations; // rotated channel int ind_rot = (o - ind + num_orientations) % num_orientations; int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; scalar_t *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot) * height * width; scalar_t *offset_bottom_diff_plus = bottom_diff + (roi_batch_ind * channels * num_orientations + c * num_orientations + ind_rot_plus) * height * width; int top_offset = (n * channels * num_orientations + c * num_orientations + o) * pooled_height * pooled_width; const scalar_t *offset_top_diff = top_diff + top_offset; const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (num_samples > 0) ? num_samples : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. 
} scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosTheta = cos(theta); scalar_t sinTheta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); scalar_t g1 = top_diff_this_bin * w1 / count; scalar_t g2 = top_diff_this_bin * w2 / count; scalar_t g3 = top_diff_this_bin * w3 / count; scalar_t g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, g1 * l_var); atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, g2 * l_var); atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, g3 * l_var); atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, g4 * l_var); } // if } // ix } // iy } // MUSA_1D_KERNEL_LOOP } // RiRoIAlignBackward #endif // RIROI_ALIGN_ROTATED_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/roi_align_musa_kernel.muh 
================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef ROI_ALIGN_MUSA_KERNEL_MUH #define ROI_ALIGN_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" /*** Forward ***/ template __global__ void roi_align_forward_musa_kernel( const int nthreads, const T* input, const T* rois, T* output, T* argmax_y, T* argmax_x, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, const int pool_mode, // 0 - max pool, 1 - avg pool const bool aligned, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; // Do not using rounding; this implementation detail is critical T offset = aligned ? (T)0.5 : (T)0.0; T roi_start_w = offset_rois[1] * spatial_scale - offset; T roi_start_h = offset_rois[2] * spatial_scale - offset; T roi_end_w = offset_rois[3] * spatial_scale - offset; T roi_end_h = offset_rois[4] * spatial_scale - offset; T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); if (pool_mode == 0) { // We do max pooling inside a bin T maxval = -FLT_MAX; T maxidx_y = -1.f, maxidx_x = -1.f; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_input, height, width, y, x, index); if (val > maxval) { maxval = val; maxidx_y = y; maxidx_x = x; } } } output[index] = maxval; argmax_y[index] = maxidx_y; argmax_x[index] = maxidx_x; } else if (pool_mode == 1) { // We do average pooling inside a bin const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T val = bilinear_interpolate(offset_input, height, width, y, x, index); output_val += val; } } output[index] = output_val / count; } } } /*** Backward ***/ template __global__ void roi_align_backward_musa_kernel( const int nthreads, const T* grad_output, const T* rois, const T* argmax_y, const T* argmax_x, T* grad_input, const int pooled_height, const int pooled_width, const T spatial_scale, const int sampling_ratio, const int pool_mode, // 0 - max pool, 1 - avg pool const bool aligned, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; 
const T grad_output_this_bin = grad_output[index]; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; T* offset_grad_input = grad_input + ((roi_batch_ind * channels + c) * height * width); if (pool_mode == 0) { T y = argmax_y[index], x = argmax_x[index]; if (y != -1.f) { T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_grad_input + y_low * width + x_low, grad_output_this_bin * w1); atomicAdd(offset_grad_input + y_low * width + x_high, grad_output_this_bin * w2); atomicAdd(offset_grad_input + y_high * width + x_low, grad_output_this_bin * w3); atomicAdd(offset_grad_input + y_high * width + x_high, grad_output_this_bin * w4); } } } else if (pool_mode == 1) { // Do not using rounding; this implementation detail is critical T offset = aligned ? (T)0.5 : (T)0.0; T roi_start_w = offset_rois[1] * spatial_scale - offset; T roi_start_h = offset_rois[2] * spatial_scale - offset; T roi_end_w = offset_rois[3] * spatial_scale - offset; T roi_end_h = offset_rois[4] * spatial_scale - offset; T roi_width = roi_end_w - roi_start_w; T roi_height = roi_end_h - roi_start_h; if (!aligned) { // for backward-compatibility only roi_width = max(roi_width, (T)1.); roi_height = max(roi_height, (T)1.); } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_height / pooled_height)); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : static_cast(ceilf(roi_width / pooled_width)); // We do average (integral) pooling inside a bin const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); for (int ix = 0; ix < roi_bin_grid_w; ix++) { const T x = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); T w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_grad_input + y_low * width + x_low, grad_output_this_bin * w1 / count); atomicAdd(offset_grad_input + y_low * width + x_high, grad_output_this_bin * w2 / count); atomicAdd(offset_grad_input + y_high * width + x_low, grad_output_this_bin * w3 / count); atomicAdd(offset_grad_input + y_high * width + x_high, grad_output_this_bin * w4 / count); } } } } } } #endif // ROI_ALIGN_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/roi_align_rotated_musa_kernel.muh ================================================ // Modified from // https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlignRotated // Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved #ifndef ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH #define ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH #include #include "pytorch_musa_helper.hpp" /*** Forward ***/ template __global__ void roi_align_rotated_forward_musa_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *top_data) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical scalar_t offset = aligned ? (scalar_t)0.5 : (scalar_t)0.0; scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. 
} if (!aligned) { // for backward-compatibility only // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); } scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosscalar_theta = cos(theta); scalar_t sinscalar_theta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 scalar_t output_val = 0.; for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta (counterclockwise) around the center and translate scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; scalar_t val = bilinear_interpolate( offset_bottom_data, height, width, y, x, index); output_val += val; } } output_val /= count; top_data[index] = output_val; } } /*** Backward ***/ template __global__ void roi_align_rotated_backward_musa_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const scalar_t *offset_bottom_rois = bottom_rois + n * 6; int roi_batch_ind = offset_bottom_rois[0]; // Do not round scalar_t offset = aligned ? 
(scalar_t)0.5 : (scalar_t)0.0; scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale - offset; scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale - offset; scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; scalar_t theta = offset_bottom_rois[5]; if (clockwise) { theta = -theta; // If clockwise, the angle needs to be reversed. } if (!aligned) { // for backward-compatibility only // Force malformed ROIs to be 1x1 roi_width = max(roi_width, (scalar_t)1.); roi_height = max(roi_height, (scalar_t)1.); } scalar_t bin_size_h = static_cast(roi_height) / static_cast(pooled_height); scalar_t bin_size_w = static_cast(roi_width) / static_cast(pooled_width); scalar_t *offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height * width; int top_offset = (n * channels + c) * pooled_height * pooled_width; const scalar_t *offset_top_diff = top_diff + top_offset; const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid_h = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. scalar_t roi_start_h = -roi_height / 2.0; scalar_t roi_start_w = -roi_width / 2.0; scalar_t cosTheta = cos(theta); scalar_t sinTheta = sin(theta); // We do average (integral) pooling inside a bin const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 const scalar_t yy = roi_start_h + ph * bin_size_h + static_cast(iy + .5f) * bin_size_h / static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 for (int ix = 0; ix < roi_bin_grid_w; ix++) { const scalar_t xx = roi_start_w + pw * bin_size_w + static_cast(ix + .5f) * bin_size_w / static_cast(roi_bin_grid_w); // Rotate by theta around the center and translate scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high, index); scalar_t g1 = top_diff_this_bin * w1 / count; scalar_t g2 = top_diff_this_bin * w2 / count; scalar_t g3 = top_diff_this_bin * w3 / count; scalar_t g4 = top_diff_this_bin * w4 / count; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); } // if } // ix } // iy } // MUSA_1D_KERNEL_LOOP } // RoIAlignBackward #endif // ROI_ALIGN_ROTATED_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/roi_pool_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef ROI_POOL_MUSA_KERNEL_MUH #define ROI_POOL_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void roi_pool_forward_musa_kernel( const int nthreads, const T* input, const T* rois, T* output, int* argmax, const int pooled_height, const int pooled_width, const T spatial_scale, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the pooled output int pw = index % pooled_width; int ph = (index / pooled_width) % pooled_height; int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; const T* offset_rois = rois + n * 5; int roi_batch_ind = offset_rois[0]; // calculate the roi region on feature maps T roi_x1 = offset_rois[1] * spatial_scale; T roi_y1 = offset_rois[2] * spatial_scale; T roi_x2 = (offset_rois[3] + 1) * spatial_scale; T roi_y2 = (offset_rois[4] + 1) * spatial_scale; // force malformed rois to be 1x1 T roi_w = roi_x2 - roi_x1; T roi_h = roi_y2 - roi_y1; if (roi_w <= 0 || roi_h <= 0) continue; T bin_size_w = roi_w / static_cast(pooled_width); T bin_size_h = roi_h / static_cast(pooled_height); // the corresponding bin region int bin_x1 = floorf(static_cast(pw) * bin_size_w + roi_x1); int bin_y1 = floorf(static_cast(ph) * bin_size_h + roi_y1); int bin_x2 = ceilf(static_cast(pw + 1) * bin_size_w + roi_x1); int bin_y2 = ceilf(static_cast(ph + 1) * bin_size_h + roi_y1); // add roi offsets and clip to input boundaries bin_x1 = min(max(bin_x1, 0), width); bin_y1 = min(max(bin_y1, 0), height); bin_x2 = min(max(bin_x2, 0), width); bin_y2 = min(max(bin_y2, 0), height); bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1); const T* offset_input = input + (roi_batch_ind * channels + c) * height * width; // Define an empty pooling region to be zero // If nothing is pooled, argmax = -1 causes nothing to be backprop'd T max_val = is_empty ? 
0 : -FLT_MAX; int max_idx = -1; for (int h = bin_y1; h < bin_y2; ++h) { for (int w = bin_x1; w < bin_x2; ++w) { int offset = h * width + w; if (offset_input[offset] > max_val) { max_val = offset_input[offset]; max_idx = offset; } } } output[index] = max_val; if (argmax != NULL) argmax[index] = max_idx; } } template __global__ void roi_pool_backward_musa_kernel( const int nthreads, const T* grad_output, const T* rois, const int* argmax, T* grad_input, const int pooled_height, const int pooled_width, const int channels, const int height, const int width) { MUSA_1D_KERNEL_LOOP(index, nthreads) { // (n, c) is an element in the pooled output int c = (index / pooled_width / pooled_height) % channels; int n = index / pooled_width / pooled_height / channels; int roi_batch_ind = rois[n * 5]; T* grad_input_offset = grad_input + ((roi_batch_ind * channels + c) * height * width); int argmax_index = argmax[index]; if (argmax_index != -1) { atomicAdd(grad_input_offset + argmax_index, grad_output[index]); } } } #endif // ROI_POOL_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/roiaware_pool3d_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef ROIAWARE_POOL3D_MUSA_KERNEL_MUH #define ROIAWARE_POOL3D_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, T &local_x, T &local_y) { T cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } template __device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, T &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, x_size, y_size, z_size, rz) in LiDAR coordinate, // cz in the bottom center T x = pt[0], y = pt[1], z = pt[2]; T cx = box3d[0], cy = box3d[1], cz = box3d[2]; T x_size = box3d[3], y_size = box3d[4], z_size = box3d[5], rz = box3d[6]; cz += z_size / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > z_size / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); float in_flag = (local_x > -x_size / 2.0) & (local_x < x_size / 2.0) & (local_y > -y_size / 2.0) & (local_y < y_size / 2.0); return in_flag; } template __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, int out_x, int out_y, int out_z, const T *rois, const T *pts, int *pts_mask) { // params rois: (N, 7) [x, y, z, x_size, y_size, z_size, rz] in LiDAR // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, // y_idxs, z_idxs) by binary bit int box_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (box_idx >= boxes_num) return; pts += pt_idx * 3; rois += box_idx * 7; pts_mask += box_idx * pts_num + pt_idx; T local_x = 0, local_y = 0; int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); pts_mask[0] = -1; if (cur_in_flag > 0) { T local_z = pts[2] - rois[2]; T x_size = rois[3], y_size = rois[4], z_size = rois[5]; T x_res = x_size / out_x; T y_res = y_size / out_y; T z_res = z_size / out_z; unsigned int x_idx = 
int((local_x + x_size / 2) / x_res); unsigned int y_idx = int((local_y + y_size / 2) / y_res); unsigned int z_idx = int(local_z / z_res); x_idx = min(max(x_idx, 0), out_x - 1); y_idx = min(max(y_idx, 0), out_y - 1); z_idx = min(max(z_idx, 0), out_z - 1); unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; pts_mask[0] = idx_encoding; } } } template __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, int max_pts_each_voxel, int out_x, int out_y, int out_z, const int *pts_mask, T *pts_idx_of_voxels) { // params pts_mask: (N, npoints) 0 or 1 // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) MUSA_1D_KERNEL_LOOP(box_idx, boxes_num) { int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; for (int k = 0; k < pts_num; k++) { if (pts_mask[box_idx * pts_num + k] != -1) { unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; unsigned int x_idx = (idx_encoding >> 16) & 0xFF; unsigned int y_idx = (idx_encoding >> 8) & 0xFF; unsigned int z_idx = idx_encoding & 0xFF; unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + y_idx * out_z * max_pts_each_voxel + z_idx * max_pts_each_voxel; unsigned int cnt = pts_idx_of_voxels[base_offset]; if (cnt < max_num_pts) { pts_idx_of_voxels[base_offset + cnt + 1] = k; pts_idx_of_voxels[base_offset]++; } } } } } template __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z, const T *pts_feature, const int *pts_idx_of_voxels, T *pooled_features, int *argmax) { // params pts_feature: (npoints, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) // params argmax: (N, out_x, out_y, out_z, C) int box_idx = blockIdx.z; int channel_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * 
out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; pooled_features += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; int argmax_idx = -1; float max_val = -1e50; int total_pts = pts_idx_of_voxels[0]; for (int k = 1; k <= total_pts; k++) { if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; argmax_idx = pts_idx_of_voxels[k]; } } if (argmax_idx != -1) { pooled_features[0] = max_val; } argmax[0] = argmax_idx; } } template __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int max_pts_each_voxel, int out_x, int out_y, int out_z, const T *pts_feature, const int *pts_idx_of_voxels, T *pooled_features) { // params pts_feature: (npoints, C) // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel), // index 0 is the counter params pooled_features: (N, out_x, out_y, out_z, C) // params argmax: (N, out_x, out_y, out_z, C) int box_idx = blockIdx.z; int channel_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; pooled_features += box_idx * out_x * out_y * out_z * channels + 
offset_base * channels + channel_idx; float sum_val = 0; int total_pts = pts_idx_of_voxels[0]; for (int k = 1; k <= total_pts; k++) { sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; } if (total_pts > 0) { pooled_features[0] = sum_val / total_pts; } } } template __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z, const int *argmax, const T *grad_out, T *grad_in) { // params argmax: (N, out_x, out_y, out_z, C) // params grad_out: (N, out_x, out_y, out_z, C) // params grad_in: (npoints, C), return value int box_idx = blockIdx.z; int channel_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; if (argmax[0] == -1) return; atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); } } template __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int out_x, int out_y, int out_z, int max_pts_each_voxel, const int *pts_idx_of_voxels, const T *grad_out, T *grad_in) { // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) // params grad_out: (N, out_x, out_y, out_z, C) // params grad_in: (npoints, C), return value int box_idx = blockIdx.z; int channel_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { int x_idx = voxel_idx_flat / (out_y * out_z); int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; int z_idx = voxel_idx_flat % out_z; if (box_idx >= boxes_num || channel_idx >= channels) return; int offset_base = x_idx * out_y * 
out_z + y_idx * out_z + z_idx; pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + offset_base * max_pts_each_voxel; grad_out += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; int total_pts = pts_idx_of_voxels[0]; float cur_grad = 1 / fmaxf(float(total_pts), 1.0); for (int k = 1; k <= total_pts; k++) { atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, grad_out[0] * cur_grad); } } } #endif // ROIAWARE_POOL3D_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/roipoint_pool3d_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef ROIPOINT_POOL3D_MUSA_KERNEL_MUH #define ROIPOINT_POOL3D_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __device__ inline void lidar_to_local_coords(T shift_x, T shift_y, T rz, T &local_x, T &local_y) { T cosa = cos(-rz), sina = sin(-rz); local_x = shift_x * cosa + shift_y * (-sina); local_y = shift_x * sina + shift_y * cosa; } template __device__ inline int check_pt_in_box3d(const T *pt, const T *box3d, T &local_x, T &local_y) { // param pt: (x, y, z) // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate, cz in the // bottom center T x = pt[0], y = pt[1], z = pt[2]; T cx = box3d[0], cy = box3d[1], cz = box3d[2]; T dx = box3d[3], dy = box3d[4], dz = box3d[5], rz = box3d[6]; cz += dz / 2.0; // shift to the center since cz in box3d is the bottom center if (fabsf(z - cz) > dz / 2.0) return 0; lidar_to_local_coords(x - cx, y - cy, rz, local_x, local_y); T in_flag = (local_x > -dx / 2.0) & (local_x < dx / 2.0) & (local_y > -dy / 2.0) & (local_y < dy / 2.0); return in_flag; } template __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, const T *xyz, const T *boxes3d, int *pts_assign) { // params xyz: (B, N, 3) // params boxes3d: (B, M, 7) // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means 
// background points int box_idx = blockIdx.y; int bs_idx = blockIdx.z; MUSA_1D_KERNEL_LOOP(pt_idx, pts_num) { if (box_idx >= boxes_num || bs_idx >= batch_size) return; int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; pts_assign[assign_idx] = 0; int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; T local_x = 0, local_y = 0; int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, local_x, local_y); pts_assign[assign_idx] = cur_in_flag; } } __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, int sampled_pts_num, const int *pts_assign, int *pts_idx, int *pooled_empty_flag) { // params xyz: (B, N, 3) // params pts_feature: (B, N, C) // params pts_assign: (B, N) // params pts_idx: (B, M, 512) // params pooled_empty_flag: (B, M) MUSA_1D_KERNEL_LOOP(boxes_idx, boxes_num) { int bs_idx = blockIdx.y; int cnt = 0; for (int k = 0; k < pts_num; k++) { if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { if (cnt < sampled_pts_num) { pts_idx[bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num + cnt] = k; cnt++; } else break; } } if (cnt == 0) { pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; } else if (cnt < sampled_pts_num) { // duplicate same points for sampling for (int k = cnt; k < sampled_pts_num; k++) { int duplicate_idx = k % cnt; int base_offset = bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; } } } } template __global__ void roipoint_pool3d_forward( int batch_size, int pts_num, int boxes_num, int feature_in_len, int sampled_pts_num, const T *xyz, const int *pts_idx, const T *pts_feature, T *pooled_features, int *pooled_empty_flag) { // params xyz: (B, N, 3) // params pts_idx: (B, M, 512) // params pts_feature: (B, N, C) // params pooled_features: (B, M, 512, 3+C) // params pooled_empty_flag: (B, M) int box_idx = 
blockIdx.y; int bs_idx = blockIdx.z; MUSA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) { if (box_idx >= boxes_num || bs_idx >= batch_size) return; if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return; int temp_idx = bs_idx * boxes_num * sampled_pts_num + box_idx * sampled_pts_num + sample_pt_idx; int src_pt_idx = pts_idx[temp_idx]; int dst_feature_offset = temp_idx * (3 + feature_in_len); for (int j = 0; j < 3; j++) pooled_features[dst_feature_offset + j] = xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; int src_feature_offset = bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; memcpy(pooled_features + dst_feature_offset + 3, pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } } #endif // ROIPOINT_POOL3D_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/rotated_feature_align_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved. // Modified from // https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu #ifndef ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH #define ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void rotated_feature_align_forward_kernel( const int nthreads, const int points, const scalar_t* bottom_data, const scalar_t* best_bboxes, const scalar_t spatial_scale, const int channels, const int height, const int width, scalar_t* top_data) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int w = index % width; int h = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; const scalar_t* bbox_offset = best_bboxes + ((n * height + h) * width + w) * 5; scalar_t roi_y = bbox_offset[0] * spatial_scale; scalar_t roi_x = bbox_offset[1] * spatial_scale; scalar_t px[5] = {roi_x, 0, 0, 0, 0}; scalar_t py[5] = {roi_y, 0, 0, 0, 0}; if (points > 1) { scalar_t roi_w = bbox_offset[2] * 
spatial_scale; scalar_t roi_h = bbox_offset[3] * spatial_scale; scalar_t roi_a = bbox_offset[4]; scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); scalar_t wx = cosa * w_2, wy = sina * w_2; scalar_t hx = -sina * h_2, hy = cosa * h_2; px[1] = roi_x + wx + hx; py[1] = roi_y + wy + hy; px[2] = roi_x - wx + hx; py[2] = roi_y - wy + hy; px[3] = roi_x - wx - hx; py[3] = roi_y - wy - hy; px[4] = roi_x + wx - hx; py[4] = roi_y + wy - hy; } const scalar_t* offset_bottom_data = bottom_data + (n * channels + c) * height * width; scalar_t output_val = bottom_data[index]; for (int i = 0; i < points; i++) { output_val += bilinear_interpolate(offset_bottom_data, height, width, py[i], px[i], i); } top_data[index] = output_val; } } template __global__ void rotated_feature_align_backward_kernel( const int nthreads, const int points, const scalar_t* top_diff, const scalar_t* best_bboxes, const scalar_t spatial_scale, const int channels, const int height, const int width, scalar_t* bottom_diff) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int w = index % width; int h = (index / width) % height; int c = (index / width / height) % channels; int n = index / width / height / channels; const scalar_t* bbox_offset = best_bboxes + ((n * height + h) * width + w) * 5; scalar_t roi_y = bbox_offset[0] * spatial_scale; scalar_t roi_x = bbox_offset[1] * spatial_scale; scalar_t px[5] = {roi_x, 0, 0, 0, 0}; scalar_t py[5] = {roi_y, 0, 0, 0, 0}; if (points > 1) { scalar_t roi_w = bbox_offset[2] * spatial_scale; scalar_t roi_h = bbox_offset[3] * spatial_scale; scalar_t roi_a = bbox_offset[4]; scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); scalar_t wx = cosa * w_2, wy = sina * w_2; scalar_t hx = -sina * h_2, hy = cosa * h_2; px[1] = roi_x + wx + hx; py[1] = roi_y + wy + hy; px[2] = roi_x - wx + hx; py[2] = roi_y - wy + hy; px[3] = roi_x - wx - hx; py[3] = roi_y - wy - hy; px[4] = roi_x + wx - hx; py[4] = roi_y + wy - hy; 
} scalar_t* offset_bottom_diff = bottom_diff + (n * channels + c) * height * width; scalar_t value_top_diff = top_diff[index]; atomicAdd(bottom_diff + index, value_top_diff); for (int i = 0; i < points; i++) { scalar_t w1, w2, w3, w4; int x_low, x_high, y_low, y_high; bilinear_interpolate_gradient(height, width, py[i], px[i], w1, w2, w3, w4, x_low, x_high, y_low, y_high, i); scalar_t g1 = value_top_diff * w1; scalar_t g2 = value_top_diff * w2; scalar_t g3 = value_top_diff * w3; scalar_t g4 = value_top_diff * w4; if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); } } } } #endif // ROTATED_FEATURE_ALIGN_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/scatter_points_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef SCATTER_POINTS_MUSA_KERNEL_MUH #define SCATTER_POINTS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; int const maxGridDim = 50000; __device__ __forceinline__ static void reduceMax(float *address, float val) { int *address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { assumed = old; old = atomicCAS(address_as_i, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); } while (assumed != old || __int_as_float(old) < val); } __device__ __forceinline__ static void reduceMax(double *address, double val) { unsigned long long *address_as_ull = reinterpret_cast(address); unsigned long long old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS( address_as_ull, assumed, __double_as_longlong(fmax(val, __longlong_as_double(assumed)))); } while (assumed != old || __longlong_as_double(old) < val); } __device__ __forceinline__ static void reduceAdd(float *address, float val) { atomicAdd(address, val); } __device__ __forceinline__ static void reduceAdd(double *address, double val) { atomicAdd(address, val); } template __global__ void feats_reduce_kernel( const T *feats, const int32_t *coors_map, T *reduced_feats, // shall be 0 at initialization const int num_input, const int num_feats, const reduce_t reduce_type) { MUSA_1D_KERNEL_LOOP(x, num_input) { int32_t reduce_to = coors_map[x]; if (reduce_to == -1) continue; const T *feats_offset = feats + x * num_feats; T *reduced_feats_offset = reduced_feats + reduce_to * num_feats; if (reduce_type == reduce_t::MAX) { for (int i = 0; i < num_feats; i++) { reduceMax(&reduced_feats_offset[i], feats_offset[i]); } } else { for (int i = 0; i < num_feats; i++) { reduceAdd(&reduced_feats_offset[i], feats_offset[i]); } } } } template __global__ void add_reduce_traceback_grad_kernel( T *grad_feats, const T *grad_reduced_feats, const int32_t *coors_map, const int32_t *reduce_count, const int num_input, const int 
num_feats, const reduce_t reduce_type) { MUSA_1D_KERNEL_LOOP(x, num_input) { int32_t reduce_to = coors_map[x]; if (reduce_to == -1) { continue; } const int input_offset = x * num_feats; T *grad_feats_offset = grad_feats + input_offset; const int reduced_offset = reduce_to * num_feats; const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; if (reduce_type == reduce_t::SUM) { for (int i = 0; i < num_feats; i++) { grad_feats_offset[i] = grad_reduced_feats_offset[i]; } } else if (reduce_type == reduce_t::MEAN) { for (int i = 0; i < num_feats; i++) { grad_feats_offset[i] = grad_reduced_feats_offset[i] / static_cast(reduce_count[reduce_to]); } } } } template __global__ void max_reduce_traceback_scatter_idx_kernel( const T *feats, const T *reduced_feats, int32_t *reduce_from, const int32_t *coors_map, const int num_input, const int num_feats) { MUSA_1D_KERNEL_LOOP(x, num_input) { int32_t reduce_to = coors_map[x]; const int input_offset = x * num_feats; const T *feats_offset = feats + input_offset; if (reduce_to == -1) { continue; } const int reduced_offset = reduce_to * num_feats; const T *reduced_feats_offset = reduced_feats + reduced_offset; int32_t *reduce_from_offset = reduce_from + reduced_offset; for (int i = 0; i < num_feats; i++) { if (feats_offset[i] == reduced_feats_offset[i]) { atomicMin(&reduce_from_offset[i], static_cast(x)); } } } } template __global__ void max_reduce_scatter_grad_kernel(T *grad_feats, const T *grad_reduced_feats, const int32_t *reduce_from, const int num_reduced, const int num_feats) { MUSA_1D_KERNEL_LOOP(x, num_reduced) { const int reduced_offset = x * num_feats; const int32_t *scatter_to_offset = reduce_from + reduced_offset; const T *grad_reduced_feats_offset = grad_reduced_feats + reduced_offset; for (int i = 0; i < num_feats; i++) { grad_feats[scatter_to_offset[i] * num_feats + i] = grad_reduced_feats_offset[i]; } } } #endif // SCATTER_POINTS_MUSA_KERNEL_MUH ================================================ FILE: 
mmcv/ops/csrc/common/musa/sigmoid_focal_loss_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH #define SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void sigmoid_focal_loss_forward_musa_kernel( const int nthreads, const T* input, const int64_t* target, const T* weight, T* output, const T gamma, const T alpha, const int num_classes) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int n = index / num_classes; int c = index % num_classes; int64_t t = target[n]; T flag_p = (t == c); T flag_n = (t != c); // p = sigmoid(x) = 1. / 1. + expf(-x) T p = (T)1. / ((T)1. + expf(-input[index])); // (1 - p)**gamma * log(p) T term_p = pow(((T)1. - p), gamma) * log(max(p, (T)FLT_MIN)); // p**gamma * log(1 - p) T term_n = pow(p, gamma) * log(max((T)1. - p, (T)FLT_MIN)); output[index] = (T)0.; output[index] += -flag_p * alpha * term_p; output[index] += -flag_n * ((T)1. - alpha) * term_n; if (weight != NULL) { output[index] *= weight[t]; } } } template __global__ void sigmoid_focal_loss_backward_musa_kernel( const int nthreads, const T* input, const int64_t* target, const T* weight, T* grad_input, const T gamma, const T alpha, const int num_classes) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int n = index / num_classes; int c = index % num_classes; int64_t t = target[n]; T flag_p = (t == c); T flag_n = (t != c); // p = sigmoid(x) = 1. / 1. + expf(-x) T p = (T)1. / ((T)1. + exp(-input[index])); // (1 - p)**gamma * (1 - p - gamma*p*log(p)) T term_p = pow(((T)1. - p), gamma) * ((T)1. - p - (gamma * p * log(max(p, (T)FLT_MIN)))); // p**gamma * (gamma * (1 - p) * log(1 - p) - p) T term_n = pow(p, gamma) * (gamma * ((T)1. - p) * log(max((T)1. - p, (T)FLT_MIN)) - p); grad_input[index] = (T)0.; grad_input[index] += -flag_p * alpha * term_p; grad_input[index] += -flag_n * ((T)1. 
- alpha) * term_n; if (weight != NULL) { grad_input[index] *= weight[t]; } } } #endif // SIGMOID_FOCAL_LOSS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/softmax_focal_loss_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved #ifndef SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH #define SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void softmax_focal_loss_forward_musa_kernel( const int nthreads, const T* softmax, const int64_t* target, const T* weight, T* output, const T gamma, const T alpha, const int num_classes) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int64_t label = target[index]; T pred = softmax[index * num_classes + label]; if (label >= 0) { output[index] = -alpha * pow((T)1. - pred, gamma) * log(max(pred, (T)FLT_MIN)); } else { output[index] = 0; } if (weight != NULL) { output[index] *= weight[label]; } } } template __global__ void softmax_focal_loss_backward_musa1_kernel( const int nthreads, const T* softmax, const int64_t* target, const T* weight, T* buff, const T gamma, const T alpha, const int num_classes) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int64_t label = target[index]; T pred = softmax[index * num_classes + label]; if (label >= 0) { buff[index] = alpha * (-pow((T)1. - pred, gamma) + gamma * pow((T)1. - pred, gamma - 1) * pred * log(max(pred, (T)FLT_MIN))); } else { buff[index] = 0; } if (weight != NULL) { buff[index] *= weight[label]; } } } template __global__ void softmax_focal_loss_backward_musa2_kernel( const int nthreads, const T* softmax, const int64_t* target, const T* buff, T* grad_input, const int num_classes) { MUSA_1D_KERNEL_LOOP(index, nthreads) { int n = index / num_classes; int c = index % num_classes; int64_t label = target[n]; if (label >= 0) { T flag = (label == c ? (T)1. 
: (T)0.); grad_input[index] = buff[n] * (flag - softmax[index]); } else { grad_input[index] = 0; } } } #endif // SOFTMAX_FOCAL_LOSS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/spconv/indice.muh ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef INDICE_MU_H_ #define INDICE_MU_H_ #include #include #include template __global__ void prepareIndicePairsKernel( tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, tv::TensorView indicePairUnique, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; Index validPoints[KernelMaxVolume * (NDim + 1)]; Index *pointPtr = nullptr; auto indicePairsDim2 = indicePairs.dim(2); Index index; for (int ix : tv::KernelLoopX(numActIn)) { numValidPoints = getValidOutPos( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), validPoints); for (Index i = 0; i < 
numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); indicePairs(offset, 0, oldNum) = ix; index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); indicePairs(offset, 1, oldNum) = index; indicePairUnique[offset * indicePairsDim2 + oldNum] = index; } } } template __global__ void prepareDeConvIndicePairsKernel( tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, tv::TensorView indicePairUnique, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; Index validPoints[KernelMaxVolume * (NDim + 1)]; Index *pointPtr = nullptr; auto indicePairsDim2 = indicePairs.dim(2); Index index; for (int ix : tv::KernelLoopX(numActIn)) { numValidPoints = getValidOutPosTranspose( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); indicePairs(offset, 0, oldNum) = ix; index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); indicePairs(offset, 1, oldNum) = index; indicePairUnique[offset * indicePairsDim2 + oldNum] = index; } } } template __global__ void assignGridAndIndiceOutKernel( tv::TensorView indicesOut, tv::TensorView gridsOut, int numAct, tv::TensorView indicePairs, 
tv::TensorView indicePairUnique, const tv::SimpleVector outSpatialShape, int batchSize) { Index index; auto indicesOutPtr = indicesOut.data(); for (int ix : tv::KernelLoopX(numAct)) { index = indicePairUnique[ix]; gridsOut[index] = ix; index = tv::rowArrayIdxInv( index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); indicesOut[ix * (NDim + 1)] = index % batchSize; } } template __global__ void assignIndicePairsKernel( tv::TensorView indicesOut, tv::TensorView gridsOut, int numActIn, tv::TensorView indicePairs, tv::TensorView indicePairUnique, const tv::SimpleVector outSpatialShape) { Index index; int kernelVolume = indicePairs.dim(0); for (int ix : tv::KernelLoopX(numActIn)) { for (int i = 0; i < kernelVolume; ++i) { index = indicePairs(i, 1, ix); if (index > -1) { indicePairs(i, 1, ix) = gridsOut[index]; } } } } template __global__ void prepareSubMGridKernel( tv::TensorView indicesIn, tv::TensorView gridsOut, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index index = 0; for (int ix : tv::KernelLoopX(numActIn)) { index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); gridsOut[index] = ix; } } template __global__ void getSubMIndicePairsKernel( tv::TensorView indicesIn, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index numValidPoints = 0; Index validPoints[KernelMaxVolume * (NDim + 1)]; Index *pointPtr = nullptr; Index index = 0; for (int ix : tv::KernelLoopX(numActIn)) { numValidPoints = 
getValidOutPos( indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), validPoints); for (int i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + spatialVolume * indicesIn(ix, 0); if (gridsOut[index] > -1) { auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); indicePairs(offset, 1, oldNum) = gridsOut[index]; indicePairs(offset, 0, oldNum) = ix; } } } } template __global__ void resetGridKernel(const Index *indicePairUnique, tv::TensorView gridsOut, int numAct) { for (int ix : tv::KernelLoopX(numAct)) { gridsOut[indicePairUnique[ix]] = -1; } } template __global__ void resetGridSubMKernel( const Index *indices, tv::TensorView gridsOut, const tv::SimpleVector outSpatialShape, int numAct) { int outSpatialShapeReg[NDim]; for (int i = 0; i < NDim; ++i) { outSpatialShapeReg[i] = outSpatialShape[i]; } Index spatialVolume = 1; auto indsPtr = indices; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index index; for (int ix : tv::KernelLoopX(numAct)) { indsPtr = indices + ix * (NDim + 1); index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); gridsOut[index + spatialVolume * indsPtr[0]] = -1; } } #endif ================================================ FILE: mmcv/ops/csrc/common/musa/spconv/reordering.muh ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef REORDERING_MU_H_
#define REORDERING_MU_H_

// NOTE(review): the include target below, the template parameter lists, and
// the template arguments of reinterpret_cast / tv::KernelLoopX /
// tv::KernelLoopY are missing from this copy. The bodies refer to scalar_t,
// Index, NumTLP, NumILP and VecType, so those are presumably the template
// parameters -- restore the exact lists from the upstream source.
#include

// Gathers rows of `features` into `buffer`:
//   buffer[row * numPlanes + iy] = features[indices[row] * numPlanes + iy]
// Each loop step handles up to NumILP rows spaced gridDim.x * blockDim.x
// apart; rows >= size are skipped.
template __global__ void gatherGenericKernel(scalar_t *buffer,
                                             const scalar_t *features,
                                             const Index *indices, int size,
                                             int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX(size)) {
    // Resolve the source row offsets once per step, outside the plane loop.
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              features[inds[ilp] + iy];
      }
    }
  }
}

// Same gather as gatherGenericKernel, but both arrays are accessed through a
// reinterpret_cast (cast target missing in this copy; presumably VecType
// pointers), so numPlanes and iy count elements of that type.
template __global__ void gatherVecKernel(scalar_t *buffer,
                                         const scalar_t *features,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;

  for (int ix : tv::KernelLoopX(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast(buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              reinterpret_cast(features)[inds[ilp] + iy];
      }
    }
  }
}

// Block-tiled gather: blockIdx.x selects a tile of NumTLP scalars along the
// plane axis, threadIdx.x the element within the tile, and the loop walks
// rows. There is no `< size` guard on iy + ILPStrideY[ilp] -- presumably the
// caller only launches this on a row count that fits the launch shape; verify
// against the caller.
template __global__ void gatherVecBlockKernel(scalar_t *buffer,
                                              const scalar_t *features,
                                              const Index *indices, int size,
                                              int numPlanes) {
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;

  for (int iy : tv::KernelLoopY(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      reinterpret_cast(buffer)[(iy + ILPStrideY[ilp]) * numPlanes +
                               threadIdx.x] =
          reinterpret_cast(features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
                                     threadIdx.x];
    }
  }
}

// Scatter-add, the inverse of gatherGenericKernel:
//   outFeatures[indices[row] * numPlanes + iy] += buffer[row * numPlanes + iy]
// The += is a plain read-modify-write, not an atomic -- presumably `indices`
// holds no duplicate rows within one launch; verify against the caller.
template __global__ void scatterAddGenericKernel(scalar_t *outFeatures,
                                                 const scalar_t *buffer,
                                                 const Index *indices,
                                                 int size, int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          outFeatures[inds[ilp] + iy] +=
              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
        }
      }
    }
  }
}

// Block-tiled scatter-add: loads one vector from outFeatures and one from
// buffer into scalar staging arrays, adds them element-wise, and stores the
// vector back. As in gatherVecBlockKernel there is no `< size` guard, and the
// update is not atomic.
template __global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
                                                  const scalar_t *buffer,
                                                  const Index *indices,
                                                  int size, int numPlanes) {
  int ILPStrideY[NumILP];
  // Number of scalars packed into one VecType.
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast(buf)[0] = reinterpret_cast(outFeatures)[idx];
      reinterpret_cast(buf2)[0] = reinterpret_cast(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast(outFeatures)[idx] = reinterpret_cast(buf)[0];
    }
  }
}

#endif

================================================
FILE: mmcv/ops/csrc/common/musa/stack_ball_query_musa_kernel.muh
================================================
//
Copyright (c) OpenMMLab. All rights reserved // Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu #ifndef STACK_BALL_QUERY_MUSA_KERNEL_MUH #define STACK_BALL_QUERY_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void stack_ball_query_forward_musa_kernel( int B, int M, float radius, int nsample, const T *new_xyz, const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt, int *idx) { // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features // :param xyz_batch_cnt: (batch_size), [N1, N2, ...] // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...] // output: // idx: (M, nsample) const T *cur_xyz = xyz; int *cur_idx = idx; MUSA_1D_KERNEL_LOOP(pt_idx, M) { int bs_idx = 0; for (int pt_cnt = 0; bs_idx < B; bs_idx++) { pt_cnt += new_xyz_batch_cnt[bs_idx]; if (pt_idx < pt_cnt) break; } int xyz_batch_start_idx = 0; for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k]; const T *new_xyz_p = new_xyz + pt_idx * 3; cur_xyz += xyz_batch_start_idx * 3; cur_idx += pt_idx * nsample; float radius2 = radius * radius; T new_x = new_xyz_p[0]; T new_y = new_xyz_p[1]; T new_z = new_xyz_p[2]; int n = xyz_batch_cnt[bs_idx]; int cnt = 0; for (int k = 0; k < n; ++k) { T x = cur_xyz[k * 3 + 0]; T y = cur_xyz[k * 3 + 1]; T z = cur_xyz[k * 3 + 2]; T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + (new_z - z) * (new_z - z); if (d2 < radius2) { if (cnt == 0) { for (int l = 0; l < nsample; ++l) { cur_idx[l] = k; } } cur_idx[cnt] = k; ++cnt; if (cnt >= nsample) break; } } if (cnt == 0) cur_idx[0] = -1; } } #endif // STACK_BALL_QUERY_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/stack_group_points_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. All rights reserved. 
// Modified from // https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu #ifndef STACK_GROUP_POINTS_MUSA_KERNEL_MUH #define STACK_GROUP_POINTS_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" #include template __global__ void stack_group_points_forward_musa_kernel( int b, int c, int m, int nsample, const T *features, const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt, T *out) { // :param features: (N1 + N2 ..., C) tensor of features to group // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor // containing the indices of features to group with :param idx_batch_cnt: // (batch_size) [M1 + M2 ...] tensor containing the indices of features to // group with :return: // output: (M1 + M2, C, nsample) tensor MUSA_1D_KERNEL_LOOP(index, m * c * nsample) { const T *cur_features = features; const int *cur_idx = idx; int sample_idx = index % nsample; int c_idx = (index / nsample) % c; int pt_idx = (index / nsample / c); if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; for (int k = 1; k < b; k++) { if (pt_idx < pt_cnt) break; pt_cnt += idx_batch_cnt[k]; bs_idx = k; } int features_batch_start_idx = 0; int features_batch_end_idx = features_batch_cnt[0]; for (int k = 0; k < bs_idx; k++) { features_batch_start_idx += features_batch_cnt[k]; features_batch_end_idx = features_batch_start_idx + features_batch_cnt[k + 1]; } cur_features += features_batch_start_idx * c; cur_idx += pt_idx * nsample + sample_idx; int in_idx = cur_idx[0] * c + c_idx; int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx; if (in_idx < features_batch_end_idx * c) { out[out_idx] = cur_features[in_idx]; } } } template __global__ void stack_group_points_backward_musa_kernel( int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx, const int *idx_batch_cnt, 
const int *features_batch_cnt, T *grad_features) { // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing // the indices of features to group with :param idx_batch_cnt: (batch_size) // [M1 + M2 ...] tensor containing the indices of features to group with // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the // indices of features to group with :return: // grad_features: (N1 + N2 ..., C) gradient of the features MUSA_1D_KERNEL_LOOP(index, m * c * nsample) { const T *cur_grad_out = grad_out; const int *cur_idx = idx; T *cur_grad_features = grad_features; int sample_idx = index % nsample; int c_idx = (index / nsample) % c; int pt_idx = (index / nsample / c); if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return; int bs_idx = 0, pt_cnt = idx_batch_cnt[0]; for (int k = 1; k < b; k++) { if (pt_idx < pt_cnt) break; pt_cnt += idx_batch_cnt[k]; bs_idx = k; } int features_batch_start_idx = 0; for (int k = 0; k < bs_idx; k++) features_batch_start_idx += features_batch_cnt[k]; cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx; cur_idx += pt_idx * nsample + sample_idx; cur_grad_features += (features_batch_start_idx + cur_idx[0]) * c + c_idx; atomicAdd(cur_grad_features, cur_grad_out[0]); } } #endif // GROUP_POINTS_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/sync_bn_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef SYNCBN_MUSA_KERNEL_MUH #define SYNCBN_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void sync_bn_forward_mean_musa_kernel(const T *input, float *mean, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer[tid] += input[index]; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { mean[c] = buffer[0] / total; } } template <> __global__ void sync_bn_forward_mean_musa_kernel(const phalf *input, float *mean, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer[tid] += static_cast(input[index]); } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { mean[c] = buffer[0] / total; } } template __global__ void sync_bn_forward_var_musa_kernel(const T *input, const float *mean, float *var, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; float td = input[index] - mean[c]; buffer[tid] += td * td; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { var[c] = buffer[0] / total; } } template <> 
__global__ void sync_bn_forward_var_musa_kernel(const phalf *input, const float *mean, float *var, int num, int channels, int spatial) { __shared__ float buffer[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; float td = static_cast(input[index]) - mean[c]; buffer[tid] += td * td; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer[tid] += buffer[tid + s]; } __syncthreads(); } int total = num * spatial; if (tid == 0) { var[c] = buffer[0] / total; } } template __global__ void sync_bn_forward_output_musa_kernel( const T *input, const float *mean, const float *var, float *running_mean, float *running_var, const float *weight, const float *bias, float *norm, float *std, T *output, int num, int channels, int spatial, float eps, float momentum, int group_size) { int tid = threadIdx.x; int c = blockIdx.x; float mean_value = mean[c]; float std_value = sqrt(var[c] + eps); if (weight != nullptr) { float weight_value = weight[c]; float bias_value = bias[c]; if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; norm[index] = (input[index] - mean_value) / std_value; output[index] = norm[index] * weight_value + bias_value; } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = (input[index] - mean_value) / std_value * weight_value + bias_value; } } } else { if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = norm[index] = (input[index] - mean_value) / std_value; } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * 
spatial + c * spatial + i % spatial; output[index] = (input[index] - mean_value) / std_value; } } } if (tid == 0) { if (std != nullptr) std[c] = std_value; if (running_mean != nullptr) { running_mean[c] = momentum * mean_value + (1 - momentum) * running_mean[c]; int count = num * spatial * group_size; float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c]; running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; } } } template <> __global__ void sync_bn_forward_output_musa_kernel( const phalf *input, const float *mean, const float *var, float *running_mean, float *running_var, const float *weight, const float *bias, float *norm, float *std, phalf *output, int num, int channels, int spatial, float eps, float momentum, int group_size) { int tid = threadIdx.x; int c = blockIdx.x; float mean_value = mean[c]; float std_value = sqrt(var[c] + eps); if (weight != nullptr) { float weight_value = weight[c]; float bias_value = bias[c]; if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; norm[index] = (static_cast(input[index]) - mean_value) / std_value; output[index] = static_cast(norm[index] * weight_value + bias_value); } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = static_cast((static_cast(input[index]) - mean_value) / std_value * weight_value + bias_value); } } } else { if (norm != nullptr) { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; norm[index] = (static_cast(input[index]) - mean_value) / std_value; output[index] = static_cast(norm[index]); } } else { for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; output[index] = static_cast( (static_cast(input[index]) - 
mean_value) / std_value); } } } if (tid == 0) { if (std != nullptr) std[c] = std_value; if (running_mean != nullptr) { running_mean[c] = momentum * mean_value + (1 - momentum) * running_mean[c]; int count = num * spatial * group_size; float var_unbias = count > 1 ? var[c] * count / (count - 1) : var[c]; running_var[c] = momentum * var_unbias + (1 - momentum) * running_var[c]; } } } template __global__ void sync_bn_backward_param_musa_kernel(const T *grad_output, const float *norm, float *grad_weight, float *grad_bias, int num, int channels, int spatial) { __shared__ float buffer1[THREADS_PER_BLOCK]; __shared__ float buffer2[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer1[tid] = buffer2[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer1[tid] += grad_output[index] * norm[index]; buffer2[tid] += grad_output[index]; } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer1[tid] += buffer1[tid + s]; buffer2[tid] += buffer2[tid + s]; } __syncthreads(); } if (tid == 0) { grad_weight[c] = buffer1[0]; grad_bias[c] = buffer2[0]; } } template <> __global__ void sync_bn_backward_param_musa_kernel(const phalf *grad_output, const float *norm, float *grad_weight, float *grad_bias, int num, int channels, int spatial) { __shared__ float buffer1[THREADS_PER_BLOCK]; __shared__ float buffer2[THREADS_PER_BLOCK]; int tid = threadIdx.x; int c = blockIdx.x; buffer1[tid] = buffer2[tid] = 0; for (int i = tid; i < num * spatial; i += blockDim.x) { int index = (i / spatial) * channels * spatial + c * spatial + i % spatial; buffer1[tid] += static_cast(grad_output[index]) * norm[index]; buffer2[tid] += static_cast(grad_output[index]); } __syncthreads(); for (int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { buffer1[tid] += buffer1[tid + s]; buffer2[tid] += buffer2[tid + s]; } __syncthreads(); } if (tid == 0) { grad_weight[c] = 
buffer1[0]; grad_bias[c] = buffer2[0]; } } template __global__ void sync_bn_backward_data_musa_kernel( int output_size, const T *grad_output, const float *weight, const float *grad_weight, const float *grad_bias, const float *norm, const float *std, T *grad_input, int num, int channels, int spatial) { int factor = num * spatial; MUSA_1D_KERNEL_LOOP(index, output_size) { int c = (index / spatial) % channels; grad_input[index] = weight[c] * (grad_output[index] - (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / std[c]; } } template <> __global__ void sync_bn_backward_data_musa_kernel( int output_size, const phalf *grad_output, const float *weight, const float *grad_weight, const float *grad_bias, const float *norm, const float *std, phalf *grad_input, int num, int channels, int spatial) { int factor = num * spatial; MUSA_1D_KERNEL_LOOP(index, output_size) { int c = (index / spatial) % channels; grad_input[index] = static_cast( weight[c] * (static_cast(grad_output[index]) - (grad_weight[c] * norm[index] + grad_bias[c]) / factor) / std[c]); } } #endif // SYNCBN_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/three_interpolate_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef THREE_INTERPOLATE_MUSA_KERNEL_MUH #define THREE_INTERPOLATE_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void three_interpolate_forward_musa_kernel( int b, int c, int m, int n, const T *points, const int *__restrict__ idx, const T *weight, T *out) { // points: (B, C, M) // idx: (B, N, 3) // weight: (B, N, 3) // output: // out: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, n) { if (bs_idx >= b || c_idx >= c) return; weight += bs_idx * n * 3 + pt_idx * 3; points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3; out += bs_idx * c * n + c_idx * n; out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + weight[2] * points[idx[2]]; } } template __global__ void three_interpolate_backward_musa_kernel( int b, int c, int n, int m, const T *grad_out, const int *__restrict__ idx, const T *weight, T *grad_points) { // grad_out: (B, C, N) // weight: (B, N, 3) // output: // grad_points: (B, C, M) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, n) { if (bs_idx >= b || c_idx >= c) return; grad_out += bs_idx * c * n + c_idx * n + pt_idx; weight += bs_idx * n * 3 + pt_idx * 3; grad_points += bs_idx * c * m + c_idx * m; idx += bs_idx * n * 3 + pt_idx * 3; atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); } } #endif // THREE_INTERPOLATE_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/three_nn_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef THREE_NN_MUSA_KERNEL_MUH #define THREE_NN_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void three_nn_forward_musa_kernel(int b, int n, int m, const T *unknown, const T *known, T *dist2, int *__restrict__ idx) { // unknown: (B, N, 3) // known: (B, M, 3) // output: // dist2: (B, N, 3) // idx: (B, N, 3) int bs_idx = blockIdx.y; MUSA_1D_KERNEL_LOOP(pt_idx, n) { if (bs_idx >= b) return; unknown += bs_idx * n * 3 + pt_idx * 3; known += bs_idx * m * 3; dist2 += bs_idx * n * 3 + pt_idx * 3; idx += bs_idx * n * 3 + pt_idx * 3; T ux = unknown[0]; T uy = unknown[1]; T uz = unknown[2]; double best1 = 1e40, best2 = 1e40, best3 = 1e40; int besti1 = 0, besti2 = 0, besti3 = 0; for (int k = 0; k < m; ++k) { T x = known[k * 3 + 0]; T y = known[k * 3 + 1]; T z = known[k * 3 + 2]; T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); if (d < best1) { best3 = best2; besti3 = besti2; best2 = best1; besti2 = besti1; best1 = d; besti1 = k; } else if (d < best2) { best3 = best2; besti3 = besti2; best2 = d; besti2 = k; } else if (d < best3) { best3 = d; besti3 = k; } } dist2[0] = best1; dist2[1] = best2; dist2[2] = best3; idx[0] = besti1; idx[1] = besti2; idx[2] = besti3; } } #endif // THREE_NN_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/tin_shift_musa_kernel.muh ================================================ // Copyright (c) OpenMMLab. 
All rights reserved #ifndef TIN_SHIFT_MUSA_KERNEL_MUH #define TIN_SHIFT_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" template __global__ void tin_shift_forward_musa_kernel( const int nthreads, const T* input, const int* shift, T* output, const int batch_size, const int channels, const int t_size, const int hw_size, const int group_size, const int group_channel) { MUSA_1D_KERNEL_LOOP(index, nthreads) { const int hw_index = index % hw_size; const int j = (index / hw_size) % channels; const int n_index = (index / hw_size / channels) % batch_size; int group_id = j / group_channel; int t_shift = shift[n_index * group_size + group_id]; int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; for (int i = 0; i < t_size; i++) { int now_t = i + t_shift; int data_id = i * hw_size * channels + offset; if (now_t < 0 || now_t >= t_size) { continue; } int out_id = now_t * hw_size * channels + offset; output[out_id] = input[data_id]; } } } template __global__ void tin_shift_backward_musa_kernel( const int nthreads, const T* input, const int* shift, T* output, const int batch_size, const int channels, const int t_size, const int hw_size, const int group_size, const int group_channel) { MUSA_1D_KERNEL_LOOP(index, nthreads) { const int hw_index = index % hw_size; const int j = (index / hw_size) % channels; const int n_index = (index / hw_size / channels) % batch_size; int group_id = j / group_channel; int t_shift = shift[n_index * group_size + group_id]; int offset = n_index * t_size * hw_size * channels + hw_size * j + hw_index; for (int i = 0; i < t_size; i++) { int now_t = i + t_shift; int data_id = i * hw_size * channels + offset; if (now_t < 0 || now_t >= t_size) { continue; } int out_id = now_t * hw_size * channels + offset; output[out_id] = input[data_id]; } } } #endif // TIN_SHIFT_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/musa/voxelization_musa_kernel.muh 
================================================ // Copyright (c) OpenMMLab. All rights reserved. #ifndef VOXELIZATION_MUSA_KERNEL_MUH #define VOXELIZATION_MUSA_KERNEL_MUH #include "pytorch_musa_helper.hpp" typedef enum { SUM = 0, MEAN = 1, MAX = 2 } reduce_t; template __global__ void dynamic_voxelize_kernel( const T* points, T_int* coors, const float voxel_x, const float voxel_y, const float voxel_z, const float coors_x_min, const float coors_y_min, const float coors_z_min, const float coors_x_max, const float coors_y_max, const float coors_z_max, const int grid_x, const int grid_y, const int grid_z, const int num_points, const int num_features, const int NDim) { // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; MUSA_1D_KERNEL_LOOP(index, num_points) { // To save some computation auto points_offset = points + index * num_features; auto coors_offset = coors + index * NDim; int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x); if (c_x < 0 || c_x >= grid_x) { coors_offset[0] = -1; continue; } int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y); if (c_y < 0 || c_y >= grid_y) { coors_offset[0] = -1; coors_offset[1] = -1; continue; } int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z); if (c_z < 0 || c_z >= grid_z) { coors_offset[0] = -1; coors_offset[1] = -1; coors_offset[2] = -1; } else { coors_offset[0] = c_z; coors_offset[1] = c_y; coors_offset[2] = c_x; } } } template __global__ void assign_point_to_voxel(const int nthreads, const T* points, T_int* point_to_voxelidx, T_int* coor_to_voxelidx, T* voxels, const int max_points, const int num_features, const int num_points, const int NDim) { MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) { // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; int index = thread_idx / num_features; int num = point_to_voxelidx[index]; int voxelidx = coor_to_voxelidx[index]; if (num > -1 && voxelidx > -1) { auto voxels_offset = voxels + voxelidx * max_points * num_features + num * 
num_features; int k = thread_idx % num_features; voxels_offset[k] = points[thread_idx]; } } } template __global__ void assign_voxel_coors(const int nthreads, T_int* coor, T_int* point_to_voxelidx, T_int* coor_to_voxelidx, T_int* voxel_coors, const int num_points, const int NDim) { MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) { // const int index = blockIdx.x * threadsPerBlock + threadIdx.x; // if (index >= num_points) return; int index = thread_idx / NDim; int num = point_to_voxelidx[index]; int voxelidx = coor_to_voxelidx[index]; if (num == 0 && voxelidx > -1) { auto coors_offset = voxel_coors + voxelidx * NDim; int k = thread_idx % NDim; coors_offset[k] = coor[thread_idx]; } } } template __global__ void point_to_voxelidx_kernel(const T_int* coor, T_int* point_to_voxelidx, T_int* point_to_pointidx, const int max_points, const int max_voxels, const int num_points, const int NDim) { MUSA_1D_KERNEL_LOOP(index, num_points) { auto coor_offset = coor + index * NDim; // skip invalid points if (coor_offset[0] == -1) continue; int num = 0; int coor_x = coor_offset[0]; int coor_y = coor_offset[1]; int coor_z = coor_offset[2]; // only calculate the coors before this coor[index] for (int i = 0; i < index; ++i) { auto prev_coor = coor + i * NDim; if (prev_coor[0] == -1) continue; // Find all previous points that have the same coors // if find the same coor, record it if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) && (prev_coor[2] == coor_z)) { num++; if (num == 1) { // point to the same coor that first show up point_to_pointidx[index] = i; } else if (num >= max_points) { // out of boundary break; } } } if (num == 0) { point_to_pointidx[index] = index; } if (num < max_points) { point_to_voxelidx[index] = num; } } } template __global__ void determin_voxel_num( // const T_int* coor, T_int* num_points_per_voxel, T_int* point_to_voxelidx, T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num, const int max_points, const int max_voxels, const int num_points) { 
// only calculate the coors before this coor[index] for (int i = 0; i < num_points; ++i) { int point_pos_in_voxel = point_to_voxelidx[i]; // record voxel if (point_pos_in_voxel == -1) { // out of max_points or invalid point continue; } else if (point_pos_in_voxel == 0) { // record new voxel int voxelidx = voxel_num[0]; if (voxel_num[0] >= max_voxels) continue; voxel_num[0] += 1; coor_to_voxelidx[i] = voxelidx; num_points_per_voxel[voxelidx] = 1; } else { int point_idx = point_to_pointidx[i]; int voxelidx = coor_to_voxelidx[point_idx]; if (voxelidx != -1) { coor_to_voxelidx[i] = voxelidx; num_points_per_voxel[voxelidx] += 1; } } } } __global__ void nondeterministic_get_assign_pos( const int nthreads, const int32_t* coors_map, int32_t* pts_id, int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) { MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) { int coors_idx = coors_map[thread_idx]; if (coors_idx > -1) { int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1); pts_id[thread_idx] = coors_pts_pos; if (coors_pts_pos == 0) { coors_order[coors_idx] = atomicAdd(coors_count, 1); } } } } template __global__ void nondeterministic_assign_point_voxel( const int nthreads, const T* points, const int32_t* coors_map, const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count, const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count, const int max_voxels, const int max_points, const int num_features, const int NDim) { MUSA_1D_KERNEL_LOOP(thread_idx, nthreads) { int coors_idx = coors_map[thread_idx]; int coors_pts_pos = pts_id[thread_idx]; if (coors_idx > -1 && coors_pts_pos < max_points) { int coors_pos = coors_order[coors_idx]; if (coors_pos < max_voxels) { auto voxels_offset = voxels + (coors_pos * max_points + coors_pts_pos) * num_features; auto points_offset = points + thread_idx * num_features; for (int k = 0; k < num_features; k++) { voxels_offset[k] = points_offset[k]; } if (coors_pts_pos == 0) { pts_count[coors_pos] = 
min(reduce_count[coors_idx], max_points); auto coors_offset = coors + coors_pos * NDim; auto coors_in_offset = coors_in + coors_idx * NDim; for (int k = 0; k < NDim; k++) { coors_offset[k] = coors_in_offset[k]; } } } } } } #endif // VOXELIZATION_MUSA_KERNEL_MUH ================================================ FILE: mmcv/ops/csrc/common/parrots_cpp_helper.hpp ================================================ #ifndef PARROTS_CPP_HELPER #define PARROTS_CPP_HELPER #include #include #include #include #include using namespace parrots; #define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \ case prim_type: { \ using scalar_t = type; \ return __VA_ARGS__(); \ } #define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \ [&] { \ const auto& the_type = TYPE; \ switch (the_type) { \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ default: \ PARROTS_NOTSUPPORTED; \ } \ }() #define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) 
\ [&] { \ const auto& the_type = TYPE; \ switch (the_type) { \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \ default: \ PARROTS_NOTSUPPORTED; \ } \ }() #endif // PARROTS_CPP_HELPER ================================================ FILE: mmcv/ops/csrc/common/parrots_cuda_helper.hpp ================================================ #ifndef PARROTS_CUDA_HELPER #define PARROTS_CUDA_HELPER #include #include #include #include #include #include #include #include #include #include "common_cuda_helper.hpp" #include "parrots_cudawarpfunction.cuh" using namespace parrots; using phalf = float16; #define __PHALF(x) (x.y) #define PARROTS_CUDA_CHECK(exp) \ do { \ cudaError_t err = exp; \ if (err != cudaSuccess) { \ fprintf(stderr, "cudaCheckError() failed : %s\n", \ cudaGetErrorString(err)); \ exit(-1); \ } \ } while (0) #define PARROTS_PRIVATE_CASE_TYPE(prim_type, type, ...) \ case prim_type: { \ using scalar_t = type; \ return __VA_ARGS__(); \ } #define PARROTS_DISPATCH_FLOATING_TYPES(TYPE, ...) \ [&] { \ const auto& the_type = TYPE; \ switch (the_type) { \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ default: \ PARROTS_NOTSUPPORTED; \ } \ }() #define PARROTS_DISPATCH_FLOATING_TYPES_AND_HALF(TYPE, ...) 
\ [&] { \ const auto& the_type = TYPE; \ switch (the_type) { \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float64, double, __VA_ARGS__) \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float32, float, __VA_ARGS__) \ PARROTS_PRIVATE_CASE_TYPE(Prim::Float16, float16, __VA_ARGS__) \ default: \ PARROTS_NOTSUPPORTED; \ } \ }() /** atomicAdd **/ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 static __inline__ __device__ double atomicAdd(double* address, double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; if (val == 0.0) return __longlong_as_double(old); do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } #endif static __inline__ __device__ float16 atomicAdd(float16* address, float16 val) { unsigned int* aligned = (unsigned int*)((size_t)address - ((size_t)address & 2)); unsigned int old = *aligned; unsigned int assumed; unsigned short old_as_us; do { assumed = old; old_as_us = (unsigned short)((size_t)address & 2 ? old >> 16 : old & 0xffff); #if __CUDACC_VER_MAJOR__ >= 9 float16 tmp; tmp.x = old_as_us; float16 sum = tmp + val; unsigned short sum_as_us = sum.x; // half sum = __float2half_rn(__half2float(__ushort_as_half(old_as_us)) // + (float)(val)); unsigned short sum_as_us = __half_as_ushort(sum); #else unsigned short sum_as_us = __float2half_rn(__half2float(old_as_us) + (float)(val)); #endif unsigned int sum_as_ui = (size_t)address & 2 ? 
(sum_as_us << 16) | (old & 0xffff) : (old & 0xffff0000) | sum_as_us; old = atomicCAS(aligned, assumed, sum_as_ui); } while (assumed != old); //__half_raw raw = {old_as_us}; // return float16(raw); return *reinterpret_cast(&old_as_us); } #endif // PARROTS_CUDA_HELPER ================================================ FILE: mmcv/ops/csrc/common/pytorch_cpp_helper.hpp ================================================ #ifndef PYTORCH_CPP_HELPER #define PYTORCH_CPP_HELPER #include #include using namespace at; #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") #define CHECK_MLU(x) \ TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") #define CHECK_MUSA(x) \ TORCH_CHECK(x.device().is_privateuseone(), #x " must be a MUSA tensor") #define CHECK_CPU(x) \ TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_CUDA_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) #define CHECK_MLU_INPUT(x) \ CHECK_MLU(x); \ CHECK_CONTIGUOUS(x) #define CHECK_MUSA_INPUT(x) \ CHECK_MUSA(x); \ CHECK_CONTIGUOUS(x) #define CHECK_CPU_INPUT(x) \ CHECK_CPU(x); \ CHECK_CONTIGUOUS(x) #endif // PYTORCH_CPP_HELPER ================================================ FILE: mmcv/ops/csrc/common/pytorch_cuda_helper.hpp ================================================ #ifndef PYTORCH_CUDA_HELPER #define PYTORCH_CUDA_HELPER #include #ifdef MMCV_WITH_MUSA #include "common_musa_helper.hpp" #include "torch_musa/csrc/aten/musa/MUSAContext.h" #include "torch_musa/csrc/core/MUSAGuard.h" #include "torch_musa/share/generated_cuda_compatible/aten/src/THC/THCAtomics.muh" #include "torch_musa/share/generated_cuda_compatible/include/ATen/musa/MUSA_PORT_ApplyUtils.muh" #else #include #include #include #include #include "common_cuda_helper.hpp" #endif using at::Half; using at::Tensor; using phalf = at::Half; #define __PHALF(x) (x) #define DIVUP(m, n) ((m) / (n) + 
((m) % (n) > 0)) #endif // PYTORCH_CUDA_HELPER ================================================ FILE: mmcv/ops/csrc/common/pytorch_device_registry.hpp ================================================ #ifndef PYTORCH_DEVICE_REGISTRY_H #define PYTORCH_DEVICE_REGISTRY_H // Using is recommended in the official documentation in // https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op. // However, we use for compatibility with CUDA 9.0 // Read https://github.com/pytorch/extension-cpp/issues/35 for more details. #include #include #include #include #include #ifdef MMCV_WITH_MUSA #include "torch_musa/csrc/aten/utils/Utils.h" #endif inline std::string GetDeviceStr(const at::Device& device) { std::string str = DeviceTypeName(device.type(), true); if (device.has_index()) { str.push_back(':'); str.append(std::to_string(device.index())); } return str; } // Registry template class DeviceRegistry; template class DeviceRegistry { public: using FunctionType = Ret (*)(Args...); static const int MAX_DEVICE_TYPES = int8_t(at::DeviceType::COMPILE_TIME_MAX_DEVICE_TYPES); void Register(at::DeviceType device, FunctionType function) { funcs_[int8_t(device)] = function; } FunctionType Find(at::DeviceType device) const { return funcs_[int8_t(device)]; } static DeviceRegistry& instance() { static DeviceRegistry inst; return inst; } private: DeviceRegistry() { for (size_t i = 0; i < MAX_DEVICE_TYPES; ++i) { funcs_[i] = nullptr; } }; FunctionType funcs_[MAX_DEVICE_TYPES]; }; // get device of first tensor param template , at::Tensor>::value, bool> = true> at::Device GetFirstTensorDevice(T&& t, Args&&... args) { return std::forward(t).device(); } template , at::Tensor>::value, bool> = true> at::Device GetFirstTensorDevice(T&& t, Args&&... 
args) { return GetFirstTensorDevice(std::forward(args)...); } // check device consistency inline std::pair CheckDeviceConsistency( const at::Device& device, int index) { return {index, device}; } template , at::Tensor>::value, bool> = true> std::pair CheckDeviceConsistency(const at::Device& device, int index, T&& t, Args&&... args); template , at::Tensor>::value, bool> = true> std::pair CheckDeviceConsistency(const at::Device& device, int index, T&& t, Args&&... args) { auto new_device = std::forward(t).device(); if (new_device.type() != device.type() || new_device.index() != device.index()) { return {index, new_device}; } return CheckDeviceConsistency(device, index + 1, std::forward(args)...); } template < typename T, typename... Args, std::enable_if_t, at::Tensor>::value, bool>> std::pair CheckDeviceConsistency(const at::Device& device, int index, T&& t, Args&&... args) { return CheckDeviceConsistency(device, index + 1, std::forward(args)...); } // dispatch template auto Dispatch(const R& registry, const char* name, Args&&... args) { auto device = GetFirstTensorDevice(std::forward(args)...); auto inconsist = CheckDeviceConsistency(device, 0, std::forward(args)...); TORCH_CHECK(inconsist.first >= int(sizeof...(Args)), name, ": at param ", inconsist.first, ", inconsistent device: ", GetDeviceStr(inconsist.second).c_str(), " vs ", GetDeviceStr(device).c_str(), "\n") auto f_ptr = registry.Find(device.type()); TORCH_CHECK(f_ptr != nullptr, name, ": implementation for device ", GetDeviceStr(device).c_str(), " not found.\n") return f_ptr(std::forward(args)...); } // helper macro #define DEVICE_REGISTRY(key) DeviceRegistry::instance() #define REGISTER_DEVICE_IMPL(key, device, value) \ struct key##_##device##_registerer { \ key##_##device##_registerer() { \ DEVICE_REGISTRY(key).Register(at::k##device, value); \ } \ }; \ static key##_##device##_registerer _##key##_##device##_registerer; #define DISPATCH_DEVICE_IMPL(key, ...) 
\ Dispatch(DEVICE_REGISTRY(key), #key, __VA_ARGS__) #endif // PYTORCH_DEVICE_REGISTRY ================================================ FILE: mmcv/ops/csrc/common/pytorch_mlu_helper.hpp ================================================ /************************************************************************* * Copyright (C) 2021 Cambricon. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. *************************************************************************/ #ifndef PYTORCH_MLU_HELPER_HPP_ #define PYTORCH_MLU_HELPER_HPP_ #ifdef MMCV_WITH_MLU #include "aten.h" #define NFU_ALIGN_SIZE 128 #define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) #define PAD_DOWN(x, y) (((x) / (y)) * (y)) #define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) #define CEIL_ALIGN(x, y) (((x) + (y) - 1) / (y) * (y)) inline int32_t getJobLimitCapability() { CNcontext drv_ctx; TORCH_CHECK(CN_SUCCESS == cnCtxGetCurrent(&drv_ctx), "cnCtxGetCurrent fails"); CNctxConfigParam ctx_conf_param; TORCH_CHECK( CN_SUCCESS == cnGetCtxConfigParam(drv_ctx, CN_CTX_CONFIG_UNION_LIMIT, &ctx_conf_param), "cnGetCtxConfigParam fails."); return (int32_t)ctx_conf_param.unionLimit; } inline int32_t getCoreNumOfJobLimitCapability() { switch (getJobLimitCapability()) { default: return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * getJobLimitCapability(); case CN_KERNEL_CLASS_BLOCK: return 1; case CN_KERNEL_CLASS_UNION: return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); case CN_KERNEL_CLASS_UNION2: return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2; case 
CN_KERNEL_CLASS_UNION4: return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4; case CN_KERNEL_CLASS_UNION8: return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8; case CN_KERNEL_CLASS_UNION16: return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16; } } #endif // MMCV_WITH_MLU #endif // PYTORCH_MLU_HELPER_HPP_ ================================================ FILE: mmcv/ops/csrc/common/pytorch_musa_helper.hpp ================================================ #ifndef PYTORCH_MUSA_HELPER #define PYTORCH_MUSA_HELPER #include #include #include #include "common_musa_helper.hpp" #include "torch_musa/csrc/aten/musa/Exceptions.h" #include "torch_musa/csrc/aten/musa/MUSAContext.h" #include "torch_musa/csrc/core/MUSAGuard.h" using at::Half; using at::Tensor; using phalf = at::Half; #define __PHALF(x) (x) #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) #endif // PYTORCH_CUDA_HELPER ================================================ FILE: mmcv/ops/csrc/common/pytorch_npu_helper.hpp ================================================ /****************************************************************************** * Copyright (c) 2022 Huawei Technologies Co., Ltd * All rights reserved. * * Licensed under the BSD 3-Clause License (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * https://opensource.org/licenses/BSD-3-Clause * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef PYTORCH_NPU_HELPER_HPP_ #define PYTORCH_NPU_HELPER_HPP_ #include #include #include "pytorch_cpp_helper.hpp" #include "pytorch_device_registry.hpp" #include "pytorch_npu_util.hpp" #define NPU_NAME_SPACE at_npu::native #ifdef MMCV_WITH_XLA #define REGISTER_NPU_IMPL(key, value) REGISTER_DEVICE_IMPL(key, XLA, value) #else #define REGISTER_NPU_IMPL(key, value) \ REGISTER_DEVICE_IMPL(key, PrivateUse1, value) #endif #ifdef MMCV_WITH_XLA #define CHECK_NPU(x) \ TORCH_CHECK(x.device().type() == at::kXLA, #x " must be a NPU tensor") #else #define CHECK_NPU(x) \ TORCH_CHECK(x.device().type() == at::kPrivateUse1, #x \ " must be a NPU " \ "tensor") #endif #endif // PYTORCH_NPU_HELPER_HPP_ ================================================ FILE: mmcv/ops/csrc/common/pytorch_npu_util.hpp ================================================ /****************************************************************************** * Copyright (c) 2022 Huawei Technologies Co., Ltd * All rights reserved. * * Licensed under the BSD 3-Clause License (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * https://opensource.org/licenses/BSD-3-Clause * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_ #define MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_ #include #include #include #include #include #include #include #include #include #include #include "pytorch_cpp_helper.hpp" #include "pytorch_device_registry.hpp" #include "torch_npu/csrc/aten/NPUNativeFunctions.h" #include "torch_npu/csrc/core/npu/NPUStream.h" #include "torch_npu/csrc/framework/OpCommand.h" #include "torch_npu/csrc/framework/interface/EnvVariables.h" #include "torch_npu/csrc/framework/utils/CalcuOpUtil.h" #include "torch_npu/csrc/framework/utils/OpPreparation.h" #define NPU_NAME_SPACE at_npu::native typedef struct aclOpExecutor aclOpExecutor; typedef struct aclTensor aclTensor; typedef struct aclScalar aclScalar; typedef struct aclIntArray aclIntArray; typedef struct aclFloatArray aclFloatArray; typedef struct aclBoolArray aclBoolArray; typedef struct aclTensorList aclTensorList; typedef aclTensor *(*_aclCreateTensor)( const int64_t *view_dims, uint64_t view_dims_num, aclDataType data_type, const int64_t *stride, int64_t offset, aclFormat format, const int64_t *storage_dims, uint64_t storage_dims_num, void *tensor_data); typedef aclScalar *(*_aclCreateScalar)(void *value, aclDataType data_type); typedef aclIntArray *(*_aclCreateIntArray)(const int64_t *value, uint64_t size); typedef aclFloatArray *(*_aclCreateFloatArray)(const float *value, uint64_t size); typedef aclBoolArray *(*_aclCreateBoolArray)(const bool *value, uint64_t size); typedef aclTensorList *(*_aclCreateTensorList)(const aclTensor *const *value, uint64_t size); typedef int (*_aclDestroyTensor)(const aclTensor *tensor); typedef int (*_aclDestroyScalar)(const aclScalar *scalar); typedef int (*_aclDestroyIntArray)(const aclIntArray *array); typedef int (*_aclDestroyFloatArray)(const aclFloatArray *array); typedef int (*_aclDestroyBoolArray)(const aclBoolArray *array); typedef int 
(*_aclDestroyTensorList)(const aclTensorList *array); constexpr int kHashBufSize = 8192; constexpr int kHashBufMaxSize = kHashBufSize + 1024; extern thread_local char g_hashBuf[kHashBufSize]; extern thread_local int g_hashOffset; #ifdef MMCV_WITH_XLA #define DEVICE_TYPE at_npu::key::NativeDeviceType #else #define DEVICE_TYPE c10::DeviceType::PrivateUse1 #endif #define AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(_) \ _(at::ScalarType::Byte, ACL_UINT8) \ _(at::ScalarType::Char, ACL_INT8) \ _(at::ScalarType::Short, ACL_INT16) \ _(at::ScalarType::Int, ACL_INT32) \ _(at::ScalarType::Long, ACL_INT64) \ _(at::ScalarType::Half, ACL_FLOAT16) \ _(at::ScalarType::Float, ACL_FLOAT) \ _(at::ScalarType::Double, ACL_DOUBLE) \ _(at::ScalarType::ComplexHalf, ACL_DT_UNDEFINED) \ _(at::ScalarType::ComplexFloat, ACL_COMPLEX64) \ _(at::ScalarType::ComplexDouble, ACL_COMPLEX128) \ _(at::ScalarType::Bool, ACL_BOOL) \ _(at::ScalarType::QInt8, ACL_DT_UNDEFINED) \ _(at::ScalarType::QUInt8, ACL_DT_UNDEFINED) \ _(at::ScalarType::QInt32, ACL_DT_UNDEFINED) \ _(at::ScalarType::BFloat16, ACL_BF16) \ _(at::ScalarType::QUInt4x2, ACL_DT_UNDEFINED) \ _(at::ScalarType::QUInt2x4, ACL_DT_UNDEFINED) \ _(at::ScalarType::Undefined, ACL_DT_UNDEFINED) \ _(at::ScalarType::NumOptions, ACL_DT_UNDEFINED) constexpr aclDataType kATenScalarTypeToAclDataTypeTable [static_cast(at::ScalarType::NumOptions) + 1] = { #define DEFINE_ENUM(_1, n) n, AT_ALL_SCALAR_TYPE_AND_ACL_DATATYPE_PAIR(DEFINE_ENUM) #undef DEFINE_ENUM }; #define GET_OP_API_FUNC(apiName) \ reinterpret_cast<_##apiName>(GetOpApiFuncAddr(#apiName)) #define MEMCPY_TO_BUF(data_expression, size_expression) \ if (g_hashOffset + (size_expression) > kHashBufSize) { \ g_hashOffset = kHashBufMaxSize; \ return; \ } \ memcpy(g_hashBuf + g_hashOffset, data_expression, size_expression); \ g_hashOffset += size_expression; inline const char *GetOpApiLibName(void) { return "libopapi.so"; } inline const char *GetCustOpApiLibName(void) { return "libcust_opapi.so"; } inline void 
*GetOpApiFuncAddrInLib(void *handler, const char *libName, const char *apiName) { auto funcAddr = dlsym(handler, apiName); if (funcAddr == nullptr) { ASCEND_LOGW("dlsym %s from %s failed, error:%s.", apiName, libName, dlerror()); } return funcAddr; } inline void *GetOpApiLibHandler(const char *libName) { auto handler = dlopen(libName, RTLD_LAZY); if (handler == nullptr) { ASCEND_LOGW("dlopen %s failed, error:%s.", libName, dlerror()); } return handler; } inline void *GetOpApiFuncAddr(const char *apiName) { static auto custOpApiHandler = GetOpApiLibHandler(GetCustOpApiLibName()); if (custOpApiHandler != nullptr) { auto funcAddr = GetOpApiFuncAddrInLib(custOpApiHandler, GetCustOpApiLibName(), apiName); if (funcAddr != nullptr) { return funcAddr; } } static auto opApiHandler = GetOpApiLibHandler(GetOpApiLibName()); if (opApiHandler == nullptr) { return nullptr; } return GetOpApiFuncAddrInLib(opApiHandler, GetOpApiLibName(), apiName); } inline c10::Scalar ConvertTensorToScalar(const at::Tensor &tensor) { c10::Scalar expScalar; const at::Tensor *aclInput = &tensor; if (aclInput->scalar_type() == at::ScalarType::Double) { double value = *(double *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::Long) { int64_t value = *(int64_t *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::Float) { float value = *(float *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::Int) { int value = *(int *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::Half) { c10::Half value = *(c10::Half *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::Bool) { int8_t value = *(int8_t *)aclInput->data_ptr(); c10::Scalar scalar(value); 
expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::ComplexDouble) { c10::complex value = *(c10::complex *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::ComplexFloat) { c10::complex value = *(c10::complex *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } else if (aclInput->scalar_type() == at::ScalarType::BFloat16) { c10::BFloat16 value = *(c10::BFloat16 *)aclInput->data_ptr(); c10::Scalar scalar(value); expScalar = scalar; } return expScalar; } inline at::Tensor CopyTensorHostToDevice(const at::Tensor &cpu_tensor) { at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory(); int deviceIndex = 0; return cpuPinMemTensor.to(c10::Device(DEVICE_TYPE, deviceIndex), cpuPinMemTensor.scalar_type(), true, true); } inline at::Tensor CopyScalarToDevice(const c10::Scalar &cpu_scalar, at::ScalarType scalar_data_type) { return CopyTensorHostToDevice( scalar_to_tensor(cpu_scalar).to(scalar_data_type)); } inline aclTensor *ConvertType(const at::Tensor &at_tensor) { static const auto aclCreateTensor = GET_OP_API_FUNC(aclCreateTensor); if (aclCreateTensor == nullptr) { return nullptr; } if (!at_tensor.defined()) { return nullptr; } at::ScalarType scalar_data_type = at_tensor.scalar_type(); aclDataType acl_data_type = kATenScalarTypeToAclDataTypeTable[static_cast(scalar_data_type)]; TORCH_CHECK( acl_data_type != ACL_DT_UNDEFINED, std::string(c10::toString(scalar_data_type)) + " has not been supported") c10::SmallVector storageDims; // if acl_data_type is ACL_STRING, storageDims is empty. 
auto itemsize = at_tensor.itemsize(); if (itemsize == 0) { AT_ERROR("When ConvertType, tensor item size of cannot be zero."); return nullptr; } if (acl_data_type != ACL_STRING) { storageDims.push_back(at_tensor.storage().nbytes() / itemsize); } const auto dimNum = at_tensor.sizes().size(); aclFormat format = ACL_FORMAT_ND; switch (dimNum) { case 3: format = ACL_FORMAT_NCL; break; case 4: format = ACL_FORMAT_NCHW; break; case 5: format = ACL_FORMAT_NCDHW; break; default: format = ACL_FORMAT_ND; } if (at_tensor.unsafeGetTensorImpl()->is_wrapped_number()) { c10::Scalar expScalar = ConvertTensorToScalar(at_tensor); at::Tensor aclInput = CopyScalarToDevice(expScalar, scalar_data_type); return aclCreateTensor(aclInput.sizes().data(), aclInput.sizes().size(), acl_data_type, aclInput.strides().data(), aclInput.storage_offset(), format, storageDims.data(), storageDims.size(), const_cast(aclInput.storage().data())); } auto acl_tensor = aclCreateTensor( at_tensor.sizes().data(), at_tensor.sizes().size(), acl_data_type, at_tensor.strides().data(), at_tensor.storage_offset(), format, storageDims.data(), storageDims.size(), const_cast(at_tensor.storage().data())); return acl_tensor; } inline aclScalar *ConvertType(const at::Scalar &at_scalar) { static const auto aclCreateScalar = GET_OP_API_FUNC(aclCreateScalar); if (aclCreateScalar == nullptr) { return nullptr; } at::ScalarType scalar_data_type = at_scalar.type(); aclDataType acl_data_type = kATenScalarTypeToAclDataTypeTable[static_cast(scalar_data_type)]; TORCH_CHECK( acl_data_type != ACL_DT_UNDEFINED, std::string(c10::toString(scalar_data_type)) + " has not been supported") aclScalar *acl_scalar = nullptr; switch (scalar_data_type) { case at::ScalarType::Double: { double value = at_scalar.toDouble(); acl_scalar = aclCreateScalar(&value, acl_data_type); break; } case at::ScalarType::Long: { int64_t value = at_scalar.toLong(); acl_scalar = aclCreateScalar(&value, acl_data_type); break; } case at::ScalarType::Bool: { bool value 
= at_scalar.toBool(); acl_scalar = aclCreateScalar(&value, acl_data_type); break; } case at::ScalarType::ComplexDouble: { auto value = at_scalar.toComplexDouble(); acl_scalar = aclCreateScalar(&value, acl_data_type); break; } default: acl_scalar = nullptr; break; } return acl_scalar; } inline aclIntArray *ConvertType(const at::IntArrayRef &at_array) { static const auto aclCreateIntArray = GET_OP_API_FUNC(aclCreateIntArray); if (aclCreateIntArray == nullptr) { return nullptr; } auto array = aclCreateIntArray(at_array.data(), at_array.size()); return array; } template inline aclBoolArray *ConvertType(const std::array &value) { static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray); if (aclCreateBoolArray == nullptr) { return nullptr; } auto array = aclCreateBoolArray(value.data(), value.size()); return array; } inline aclBoolArray *ConvertType(const at::ArrayRef &value) { static const auto aclCreateBoolArray = GET_OP_API_FUNC(aclCreateBoolArray); if (aclCreateBoolArray == nullptr) { return nullptr; } auto array = aclCreateBoolArray(value.data(), value.size()); return array; } inline aclTensorList *ConvertType(const at::TensorList &at_tensor_list) { static const auto aclCreateTensorList = GET_OP_API_FUNC(aclCreateTensorList); if (aclCreateTensorList == nullptr) { return nullptr; } std::vector tensor_list(at_tensor_list.size()); for (size_t i = 0; i < at_tensor_list.size(); i++) { tensor_list[i] = ConvertType(at_tensor_list[i]); } auto acl_tensor_list = aclCreateTensorList(tensor_list.data(), tensor_list.size()); return acl_tensor_list; } inline aclTensor *ConvertType(const c10::optional &opt_tensor) { if (opt_tensor.has_value() && opt_tensor.value().defined()) { return ConvertType(opt_tensor.value()); } return nullptr; } inline aclIntArray *ConvertType( const c10::optional &opt_array) { if (opt_array.has_value()) { return ConvertType(opt_array.value()); } return nullptr; } inline aclScalar *ConvertType(const c10::optional &opt_scalar) { if 
(opt_scalar.has_value()) { return ConvertType(opt_scalar.value()); } return nullptr; } inline aclDataType ConvertType(const at::ScalarType scalarType) { return kATenScalarTypeToAclDataTypeTable[static_cast(scalarType)]; } template T ConvertType(T value) { return value; } template auto ConvertToOpApiFunc(const Tuple ¶ms, void *opApiAddr, std::index_sequence) { typedef int (*OpApiFunc)( typename std::decay(params))>::type...); auto func = reinterpret_cast(opApiAddr); return func; } template auto ConvertToOpApiFunc(const Tuple ¶ms, void *opApiAddr) { static constexpr auto size = std::tuple_size::value; return ConvertToOpApiFunc(params, opApiAddr, std::make_index_sequence{}); } inline void Release(aclTensor *p) { static const auto aclDestroyTensor = GET_OP_API_FUNC(aclDestroyTensor); if (aclDestroyTensor == nullptr) { return; } aclDestroyTensor(p); } inline void Release(aclScalar *p) { static const auto aclDestroyScalar = GET_OP_API_FUNC(aclDestroyScalar); if (aclDestroyScalar == nullptr) { return; } aclDestroyScalar(p); } inline void Release(aclIntArray *p) { static const auto aclDestroyIntArray = GET_OP_API_FUNC(aclDestroyIntArray); if (aclDestroyIntArray == nullptr) { return; } aclDestroyIntArray(p); } inline void Release(aclBoolArray *p) { static const auto aclDestroyBoolArray = GET_OP_API_FUNC(aclDestroyBoolArray); if (aclDestroyBoolArray == nullptr) { return; } aclDestroyBoolArray(p); } inline void Release(aclTensorList *p) { static const auto aclDestroyTensorList = GET_OP_API_FUNC(aclDestroyTensorList); if (aclDestroyTensorList == nullptr) { return; } aclDestroyTensorList(p); } template void Release(T value) { (void)value; } template void CallRelease(Tuple t, std::index_sequence) { (void)std::initializer_list{(Release(std::get(t)), 0)...}; } template void ReleaseConvertTypes(Tuple &t) { static constexpr auto size = std::tuple_size::value; CallRelease(t, std::make_index_sequence{}); } template constexpr auto ConvertTypes(Ts &...args) { return 
std::make_tuple(ConvertType(args)...); } template auto call(Function f, Tuple t, std::index_sequence) { return f(std::get(t)...); } template auto call(Function f, Tuple t) { static constexpr auto size = std::tuple_size::value; return call(f, t, std::make_index_sequence{}); } template void AddParamToBuf(const std::array &value) { MEMCPY_TO_BUF(value.data(), value.size() * sizeof(bool)); } template void AddParamToBuf(const T &value) { MEMCPY_TO_BUF(&value, sizeof(T)); } void AddParamToBuf(const at::Tensor &); void AddParamToBuf(const at::Scalar &); void AddParamToBuf(const at::IntArrayRef &); void AddParamToBuf(const at::ArrayRef &); void AddParamToBuf(const at::TensorList &); void AddParamToBuf(const c10::optional &); void AddParamToBuf(const c10::optional &); void AddParamToBuf(const c10::optional &); void AddParamToBuf(const at::ScalarType); void AddParamToBuf(const string &); void AddParamToBuf(); template void AddParamToBuf(const T &arg, Args &...args) { AddParamToBuf(arg); AddParamToBuf(args...); } uint64_t CalcHashId(); typedef int (*InitHugeMemThreadLocal)(void *, bool); typedef void (*UnInitHugeMemThreadLocal)(void *, bool); typedef void (*ReleaseHugeMem)(void *, bool); #define EXEC_NPU_CMD(aclnn_api, ...) 
\ do { \ static const auto getWorkspaceSizeFuncAddr = \ GetOpApiFuncAddr(#aclnn_api "GetWorkspaceSize"); \ static const auto opApiFuncAddr = GetOpApiFuncAddr(#aclnn_api); \ static const auto initMemAddr = \ GetOpApiFuncAddr("InitHugeMemThreadLocal"); \ static const auto unInitMemAddr = \ GetOpApiFuncAddr("UnInitHugeMemThreadLocal"); \ static const auto releaseMemAddr = GetOpApiFuncAddr("ReleaseHugeMem"); \ TORCH_CHECK( \ getWorkspaceSizeFuncAddr != nullptr && opApiFuncAddr != nullptr, \ #aclnn_api, " or ", #aclnn_api "GetWorkspaceSize", " not in ", \ GetOpApiLibName(), ", or ", GetOpApiLibName(), "not found."); \ auto acl_stream = c10_npu::getCurrentNPUStream().stream(false); \ uint64_t workspace_size = 0; \ uint64_t *workspace_size_addr = &workspace_size; \ aclOpExecutor *executor = nullptr; \ aclOpExecutor **executor_addr = &executor; \ InitHugeMemThreadLocal initMemFunc = \ reinterpret_cast(initMemAddr); \ UnInitHugeMemThreadLocal unInitMemFunc = \ reinterpret_cast(unInitMemAddr); \ if (initMemFunc) { \ initMemFunc(nullptr, false); \ } \ auto converted_params = \ ConvertTypes(__VA_ARGS__, workspace_size_addr, executor_addr); \ static auto getWorkspaceSizeFunc = \ ConvertToOpApiFunc(converted_params, getWorkspaceSizeFuncAddr); \ auto workspace_status = call(getWorkspaceSizeFunc, converted_params); \ TORCH_CHECK(workspace_status == 0, \ "call " #aclnn_api " failed, detail:", aclGetRecentErrMsg()); \ void *workspace_addr = nullptr; \ if (workspace_size != 0) { \ at::TensorOptions options = \ at::TensorOptions(torch_npu::utils::get_npu_device_type()); \ auto workspace_tensor = \ at::empty({workspace_size}, options.dtype(kByte)); \ workspace_addr = const_cast(workspace_tensor.storage().data()); \ } \ auto acl_call = [converted_params, workspace_addr, workspace_size, \ acl_stream, executor]() -> int { \ typedef int (*OpApiFunc)(void *, uint64_t, aclOpExecutor *, \ const aclrtStream); \ OpApiFunc opApiFunc = reinterpret_cast(opApiFuncAddr); \ auto api_ret = \ 
opApiFunc(workspace_addr, workspace_size, executor, acl_stream); \ TORCH_CHECK(api_ret == 0, "call " #aclnn_api " failed, detail:", \ aclGetRecentErrMsg()); \ ReleaseConvertTypes(converted_params); \ ReleaseHugeMem releaseMemFunc = \ reinterpret_cast(releaseMemAddr); \ if (releaseMemFunc) { \ releaseMemFunc(nullptr, false); \ } \ return api_ret; \ }; \ at_npu::native::OpCommand cmd; \ cmd.Name(#aclnn_api); \ cmd.SetCustomHandler(acl_call); \ cmd.Run(); \ if (unInitMemFunc) { \ unInitMemFunc(nullptr, false); \ } \ } while (false) #endif // MMCV_OPS_CSRC_COMMON_PYTORCH_NPU_UTIL_HPP_ ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/paramsgrid.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef PARAMS_GRID_H_ #define PARAMS_GRID_H_ #include #include namespace detail { template int getTotalSize(std::vector arg) { return arg.size(); } template int getTotalSize(std::vector arg, std::vector... args) { return arg.size() * getTotalSize(args...); } template int getSize(std::vector arg) { return arg.size(); } template void assigner(TT &src, std::vector counter, std::vector &arg) { std::get(src) = arg[counter[Idx]]; } template void assigner(TT &src, std::vector counter, std::vector &arg, std::vector &...args) { std::get(src) = arg[counter[Idx]]; assigner(src, counter, args...); } } // namespace detail template std::vector> paramsGrid(std::vector... 
args) { int length = detail::getTotalSize(args...); std::vector sizes = {detail::getSize(args)...}; int size = sizes.size(); std::vector> params(length); std::vector counter(size); for (int i = 0; i < length; ++i) { detail::assigner<0>(params[i], counter, args...); counter[size - 1] += 1; for (int c = size - 1; c >= 0; --c) { if (counter[c] == sizes[c] && c > 0) { counter[c - 1] += 1; counter[c] = 0; } } } return params; } #endif ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/prettyprint.h ================================================ // Copyright Louis Delacroix 2010 - 2014. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // // A pretty printing library for C++ // // Usage: // Include this header, and operator<< will "just work". #ifndef H_PRETTY_PRINT #define H_PRETTY_PRINT #include #include #include #include #include #include #include #include #include #include namespace pretty_print { namespace detail { // SFINAE type trait to detect whether T::const_iterator exists. 
struct sfinae_base { using yes = char; using no = yes[2]; }; template struct has_const_iterator : private sfinae_base { private: template static yes &test(typename C::const_iterator *); template static no &test(...); public: static const bool value = sizeof(test(nullptr)) == sizeof(yes); using type = T; }; template struct has_begin_end : private sfinae_base { private: template static yes & f(typename std::enable_if< std::is_same(&C::begin)), typename C::const_iterator (C::*)() const>::value>::type *); template static no &f(...); template static yes &g(typename std::enable_if< std::is_same(&C::end)), typename C::const_iterator (C::*)() const>::value, void>::type *); template static no &g(...); public: static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); }; } // namespace detail // Holds the delimiter values for a specific character type template struct delimiters_values { using char_type = TChar; const char_type *prefix; const char_type *delimiter; const char_type *postfix; }; // Defines the delimiter values for a specific container and character type template struct delimiters { using type = delimiters_values; static const type values; }; // Functor to print containers. You can use this directly if you want // to specify a non-default delimiters type. The printing logic can // be customized by specializing the nested template. 
template , typename TDelimiters = delimiters> struct print_container_helper { using delimiters_type = TDelimiters; using ostream_type = std::basic_ostream; template struct printer { static void print_body(const U &c, ostream_type &stream) { using std::begin; using std::end; auto it = begin(c); const auto the_end = end(c); if (it != the_end) { for (;;) { stream << *it; if (++it == the_end) break; if (delimiters_type::values.delimiter != NULL) stream << delimiters_type::values.delimiter; } } } }; print_container_helper(const T &container) : container_(container) {} inline void operator()(ostream_type &stream) const { if (delimiters_type::values.prefix != NULL) stream << delimiters_type::values.prefix; printer::print_body(container_, stream); if (delimiters_type::values.postfix != NULL) stream << delimiters_type::values.postfix; } private: const T &container_; }; // Specialization for pairs template template struct print_container_helper::printer> { using ostream_type = typename print_container_helper::ostream_type; static void print_body(const std::pair &c, ostream_type &stream) { stream << c.first; if (print_container_helper::delimiters_type::values .delimiter != NULL) stream << print_container_helper::delimiters_type::values .delimiter; stream << c.second; } }; // Specialization for tuples template template struct print_container_helper::printer> { using ostream_type = typename print_container_helper::ostream_type; using element_type = std::tuple; template struct Int {}; static void print_body(const element_type &c, ostream_type &stream) { tuple_print(c, stream, Int<0>()); } static void tuple_print(const element_type &, ostream_type &, Int) {} static void tuple_print( const element_type &c, ostream_type &stream, typename std::conditional, std::nullptr_t>::type) { stream << std::get<0>(c); tuple_print(c, stream, Int<1>()); } template static void tuple_print(const element_type &c, ostream_type &stream, Int) { if (print_container_helper::delimiters_type::values 
.delimiter != NULL) stream << print_container_helper::delimiters_type::values .delimiter; stream << std::get(c); tuple_print(c, stream, Int()); } }; // Prints a print_container_helper to the specified stream. template inline std::basic_ostream &operator<<( std::basic_ostream &stream, const print_container_helper &helper) { helper(stream); return stream; } // Basic is_container template; specialize to derive from std::true_type for all // desired container types template struct is_container : public std::integral_constant::value && detail::has_begin_end::beg_value && detail::has_begin_end::end_value> {}; template struct is_container : std::true_type {}; template struct is_container : std::false_type {}; template struct is_container> : std::true_type {}; template struct is_container> : std::true_type {}; template struct is_container> : std::true_type {}; // Default delimiters template struct delimiters { static const delimiters_values values; }; template const delimiters_values delimiters::values = {"[", ", ", "]"}; template struct delimiters { static const delimiters_values values; }; template const delimiters_values delimiters::values = {L"[", L", ", L"]"}; // Delimiters for (multi)set and unordered_(multi)set template struct delimiters<::std::set, char> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::set, char>::values = {"{", ", ", "}"}; template struct delimiters<::std::set, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::set, wchar_t>::values = { L"{", L", ", L"}"}; template struct delimiters<::std::multiset, char> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::multiset, char>::values = { "{", ", ", "}"}; template struct delimiters<::std::multiset, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::multiset, wchar_t>::values = { L"{", L", ", L"}"}; template 
struct delimiters<::std::unordered_set, char> { static const delimiters_values values; }; template const delimiters_values delimiters< ::std::unordered_set, char>::values = { "{", ", ", "}"}; template struct delimiters<::std::unordered_set, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters< ::std::unordered_set, wchar_t>::values = { L"{", L", ", L"}"}; template struct delimiters<::std::unordered_multiset, char> { static const delimiters_values values; }; template const delimiters_values delimiters< ::std::unordered_multiset, char>::values = { "{", ", ", "}"}; template struct delimiters<::std::unordered_multiset, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::unordered_multiset, wchar_t>::values = {L"{", L", ", L"}"}; // Delimiters for pair and tuple template struct delimiters, char> { static const delimiters_values values; }; template const delimiters_values delimiters, char>::values = { "(", ", ", ")"}; template struct delimiters<::std::pair, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::pair, wchar_t>::values = {L"(", L", ", L")"}; template struct delimiters, char> { static const delimiters_values values; }; template const delimiters_values delimiters, char>::values = { "(", ", ", ")"}; template struct delimiters<::std::tuple, wchar_t> { static const delimiters_values values; }; template const delimiters_values delimiters<::std::tuple, wchar_t>::values = {L"(", L", ", L")"}; // Type-erasing helper class for easy use of custom delimiters. // Requires TCharTraits = std::char_traits and TChar = char or wchar_t, // and MyDelims needs to be defined for TChar. Usage: "cout << // pretty_print::custom_delims(x)". 
struct custom_delims_base { virtual ~custom_delims_base() {} virtual std::ostream &stream(::std::ostream &) = 0; virtual std::wostream &stream(::std::wostream &) = 0; }; template struct custom_delims_wrapper : custom_delims_base { custom_delims_wrapper(const T &t_) : t(t_) {} std::ostream &stream(std::ostream &s) { return s << print_container_helper, Delims>( t); } std::wostream &stream(std::wostream &s) { return s << print_container_helper, Delims>(t); } private: const T &t; }; template struct custom_delims { template custom_delims(const Container &c) : base(new custom_delims_wrapper(c)) {} std::unique_ptr base; }; template inline std::basic_ostream &operator<<( std::basic_ostream &s, const custom_delims &p) { return p.base->stream(s); } // A wrapper for a C-style array given as pointer-plus-size. // Usage: std::cout << pretty_print_array(arr, n) << std::endl; template struct array_wrapper_n { typedef const T *const_iterator; typedef T value_type; array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} inline const_iterator begin() const { return _array; } inline const_iterator end() const { return _array + _n; } private: const T *const _array; size_t _n; }; // A wrapper for hash-table based containers that offer local iterators to each // bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket // 5 of container m.) 
template struct bucket_print_wrapper { typedef typename T::const_local_iterator const_iterator; typedef typename T::size_type size_type; const_iterator begin() const { return m_map.cbegin(n); } const_iterator end() const { return m_map.cend(n); } bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} private: const T &m_map; const size_type n; }; } // namespace pretty_print // Global accessor functions for the convenience wrappers template inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, size_t n) { return pretty_print::array_wrapper_n(a, n); } template pretty_print::bucket_print_wrapper bucket_print(const T &m, typename T::size_type n) { return pretty_print::bucket_print_wrapper(m, n); } // Main magic entry point: An overload snuck into namespace std. // Can we do better? namespace std { // Prints a container to the stream using default delimiters template inline typename enable_if<::pretty_print::is_container::value, basic_ostream &>::type operator<<(basic_ostream &stream, const T &container) { return stream << ::pretty_print::print_container_helper( container); } } // namespace std #endif // H_PRETTY_PRINT ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once #include #include #include #include #include #include #include #include namespace py = pybind11; template std::vector array2Vector(TPyObject arr) { py::array arr_np = arr; size_t size = arr.attr("size").template cast(); py::array_t arr_cc = arr_np; std::vector data(arr_cc.data(), arr_cc.data() + size); return data; } template std::vector arrayT2Vector(py::array_t arr) { std::vector data(arr.data(), arr.data() + arr.size()); return data; } template tv::TensorView array2TensorView(TPyObject arr) { py::array arr_np = arr; py::array_t arr_cc = arr_np; tv::Shape shape; for (int i = 0; i < arr_cc.ndim(); ++i) { shape.push_back(arr_cc.shape(i)); } return tv::TensorView(arr_cc.mutable_data(), shape); } template tv::TensorView arrayT2TensorView(py::array_t arr) { tv::Shape shape; for (int i = 0; i < arr.ndim(); ++i) { shape.push_back(arr.shape(i)); } return tv::TensorView(arr.mutable_data(), shape); } ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#ifndef SPCONV_GEOMETRY_H_ #define SPCONV_GEOMETRY_H_ #include #include #include template TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape, Index *out) { Index lowers[NDim]; Index uppers[NDim]; Index counter[NDim]; Index counterSize[NDim]; Index pointCounter = 0; Index val; Index numPoints = 1; Index m, offset; bool valid = false; #pragma unroll for (unsigned i = 0; i < NDim; ++i) { lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + stride[i] + padding[i]) / stride[i]; uppers[i] = (input_pos[i] + padding[i]) / stride[i]; } #pragma unroll for (unsigned i = 0; i < NDim; ++i) { counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); numPoints *= counterSize[i]; } #pragma unroll for (unsigned i = 0; i < NDim; ++i) { counter[i] = 0; } for (int i = 0; i < numPoints; ++i) { valid = true; m = 1; offset = 0; #pragma unroll for (int j = NDim - 1; j >= 0; --j) { val = uppers[j] - counter[j] * dilation[j]; out[pointCounter * (NDim + 1) + j] = val; if (val < 0 || (val > outSpatialShape[j] - 1)) { valid = false; // break; } offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; m *= kernelSize[j]; } out[pointCounter * (NDim + 1) + NDim] = offset; if (valid) ++pointCounter; counter[NDim - 1] += 1; #pragma unroll for (int c = NDim - 1; c >= 0; --c) { if (counter[c] == counterSize[c] && c > 0) { counter[c - 1] += 1; counter[c] = 0; } } } return pointCounter; } template TV_HOST_DEVICE Index getValidOutPosTranspose( const Index *input_pos, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape, Index *out) { Index lowers[NDim]; Index uppers[NDim]; Index counter[NDim]; Index counterSize[NDim]; Index pointCounter = 0; Index val; Index numPoints = 1; Index m, offset; bool valid = false; #pragma unroll for (unsigned i = 0; i < NDim; ++i) { lowers[i] = 
input_pos[i] * stride[i] - padding[i]; uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; } #pragma unroll for (unsigned i = 0; i < NDim; ++i) { counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); numPoints *= counterSize[i]; } #pragma unroll for (unsigned i = 0; i < NDim; ++i) { counter[i] = 0; } for (int i = 0; i < numPoints; ++i) { valid = true; m = 1; offset = 0; #pragma unroll for (int j = NDim - 1; j >= 0; --j) { val = uppers[j] - counter[j] * dilation[j]; out[pointCounter * (NDim + 1) + j] = val; if (val < 0 || (val > outSpatialShape[j] - 1)) { valid = false; } offset += m * (val - lowers[j]) / dilation[j]; m *= kernelSize[j]; } out[pointCounter * (NDim + 1) + NDim] = offset; if (valid) ++pointCounter; counter[NDim - 1] += 1; #pragma unroll for (int c = NDim - 1; c >= 0; --c) { if (counter[c] == counterSize[c] && c > 0) { counter[c - 1] += 1; counter[c] = 0; } } } return pointCounter; } template Index getIndicePairsConv(tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape) { // indicesOut: num_active * kernelVolume * (NDim + 1) Index numAct = 0; auto numActIn = indicesIn.dim(0); Index batchIdx = 0; Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; std::vector validPoints_(kernelVolume * (NDim + 1)); Index *validPoints = validPoints_.data(); Index *pointPtr = nullptr; for (int j = 0; j < numActIn; ++j) { batchIdx = indicesIn(j, 0); numValidPoints = getValidOutPos( indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, dilation, outSpatialShape, validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); 
auto offset = pointPtr[NDim]; auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + spatialVolume * batchIdx; if (gridsOut[index] == -1) { for (unsigned k = 1; k < NDim + 1; ++k) { indicesOut(numAct, k) = pointPtr[k - 1]; } indicesOut(numAct, 0) = batchIdx; gridsOut[index] = numAct++; } // indicePairs: [K, 2, L] indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; } } return numAct; } template Index getIndicePairsDeConv(tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const Index *kernelSize, const Index *stride, const Index *padding, const Index *dilation, const Index *outSpatialShape) { Index numAct = 0; auto numActIn = indicesIn.dim(0); Index batchIdx = 0; Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; std::vector validPoints_(kernelVolume * (NDim + 1)); Index *validPoints = validPoints_.data(); Index *pointPtr = nullptr; for (int j = 0; j < numActIn; ++j) { batchIdx = indicesIn(j, 0); numValidPoints = getValidOutPosTranspose( indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, dilation, outSpatialShape, validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; auto index = tv::rowArrayIdx(pointPtr, outSpatialShape) + spatialVolume * batchIdx; if (gridsOut[index] == -1) { for (unsigned k = 1; k < NDim + 1; ++k) { indicesOut(numAct, k) = pointPtr[k - 1]; } indicesOut(numAct, 0) = batchIdx; gridsOut[index] = numAct++; } // indicePairs: [K, 2, L] indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; } } return numAct; } template Index getIndicePairsSubM(tv::TensorView indicesIn, 
tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const Index *const kernelSize, const Index *const stride, const Index *const padding, const Index *dilation, const Index *const outSpatialShape) { auto numActIn = indicesIn.dim(0); Index spatialVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { spatialVolume *= outSpatialShape[i]; } Index kernelVolume = 1; #pragma unroll for (int i = 0; i < NDim; ++i) { kernelVolume *= kernelSize[i]; } Index numValidPoints = 0; // Index validPoints[kernelVolume * (NDim + 1)]; std::vector validPoints_(kernelVolume * (NDim + 1)); Index *validPoints = validPoints_.data(); Index *pointPtr = nullptr; Index index = 0; for (int j = 0; j < numActIn; ++j) { index = tv::rowArrayIdx(indicesIn.data() + j * (NDim + 1) + 1, outSpatialShape) + spatialVolume * indicesIn(j, 0); gridsOut[index] = j; } for (int j = 0; j < numActIn; ++j) { numValidPoints = getValidOutPos( indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding, dilation, outSpatialShape, validPoints); for (Index i = 0; i < numValidPoints; ++i) { pointPtr = validPoints + i * (NDim + 1); auto offset = pointPtr[NDim]; index = tv::rowArrayIdx(pointPtr, outSpatialShape) + spatialVolume * indicesIn(j, 0); if (gridsOut[index] > -1) { indicePairs(offset, 0, indiceNum[offset]) = j; indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index]; } } } return numActIn; } #endif ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/spconv/indice.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef SPARSE_CONV_INDICE_FUNCTOR_H_ #define SPARSE_CONV_INDICE_FUNCTOR_H_ #include namespace functor { template struct CreateConvIndicePairFunctorP1 { Index operator()(const Device& d, tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, tv::TensorView indicePairUnique, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape, bool transpose); }; template struct CreateConvIndicePairFunctorP2 { Index operator()(const Device& d, tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, tv::TensorView indicePairUnique, const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid = false); }; template struct CreateConvIndicePairFunctor { Index operator()(const Device& d, tv::TensorView indicesIn, tv::TensorView indicesOut, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid = false); }; template struct CreateSubMIndicePairFunctor { Index operator()(const Device& d, tv::TensorView indicesIn, tv::TensorView gridsOut, tv::TensorView indicePairs, tv::TensorView indiceNum, const tv::SimpleVector kernelSize, const tv::SimpleVector stride, const tv::SimpleVector padding, 
const tv::SimpleVector dilation, const tv::SimpleVector outSpatialShape, bool transpose, bool resetGrid = false); }; } // namespace functor #endif ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h ================================================ // Copyright 2019 Yan Yan // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef SPARSE_MAXPOOL_FUNCTOR_H_ #define SPARSE_MAXPOOL_FUNCTOR_H_ #include namespace functor { template struct SparseMaxPoolForwardFunctor { void operator()(const Device& d, tv::TensorView outFeatures, tv::TensorView inFeatures, tv::TensorView indices, int size); }; template struct SparseMaxPoolBackwardFunctor { void operator()(const Device& d, tv::TensorView outFeatures, tv::TensorView inFeatures, tv::TensorView fout, tv::TensorView fin, tv::TensorView indices, int size); }; } // namespace functor #endif ================================================ FILE: mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h ================================================ #ifndef MP_HELPER_H_ #define MP_HELPER_H_ #include #include template struct mp_list {}; template using mp_list_c = mp_list...>; namespace detail { template constexpr F mp_for_each_impl(mp_list, F &&f) { return std::initializer_list{(f(T()), 0)...}, std::forward(f); } template constexpr F mp_for_each_impl(mp_list<>, F &&f) { return std::forward(f); } } // namespace detail namespace detail { template class B> struct mp_rename_impl { // An error 
"no type named 'type'" here means that the first argument to // mp_rename is not a list }; template